[DRAFT] Perceptual Hashing Deduplication #226
base: develop
Changes from all commits: b920739, b21a57f, f2f5180, ef5234a, 3c00e5f, bade86b
@@ -4,7 +4,7 @@
 See https://github.com/voxel51/fiftyone for more information.

-| Copyright 2017-2024, Voxel51, Inc.
+| Copyright 2017-2025, Voxel51, Inc.
 | `voxel51.com <https://voxel51.com/>`_
 |
 """
@@ -544,6 +544,7 @@ def compute_similarity(
     brain_key=None,
     model=None,
     model_kwargs=None,
+    hash_method=None,
     force_square=False,
     alpha=None,
     batch_size=None,

Review thread on the new ``hash_method`` argument:

Reviewer: What do you think about removing these arguments from …

Reviewer: I now see that this is effectively what you are doing in …

Author: What do you mean by this? The functionality is primarily exposed through …

Reviewer: Both functionality and implementation should be in …

Author: I'd argue it makes more sense to keep the current layout for the following reasons. …

Reviewer: I think this discussion is getting at how we want to manage the brain from a design-philosophy standpoint. This is probably something we should get together and discuss properly, because I'm sure it will come up a lot more in the future. What do you think? Regarding the current point at hand: first, I'd like to say that my main concern is this code not being in similarity, rather than it being in … The case for moving to …

Author: To address your points: …
@@ -631,6 +632,8 @@ def compute_similarity(
             must expose embeddings (``model.has_embeddings = True``)
         model_kwargs (None): a dictionary of optional keyword arguments to pass
             to the model's ``Config`` when a model name is provided
+        hash_method (None): the perceptual hashing method to use in place of
+            embeddings. The supported values are ``["dhash", "phash", "ahash"]``
         force_square (False): whether to minimally manipulate the patch
             bounding boxes into squares prior to extraction. Only applicable
             when a ``model`` and ``patches_field``/``roi_field`` are specified
@@ -672,6 +675,7 @@ def compute_similarity(
         brain_key,
         model,
         model_kwargs,
+        hash_method,
         force_square,
         alpha,
         batch_size,
@@ -691,6 +695,7 @@ def compute_near_duplicates(
     similarity_index=None,
     model=None,
     model_kwargs=None,
+    hash_method=None,
     force_square=False,
     alpha=None,
     batch_size=None,
@@ -745,6 +750,8 @@ def compute_near_duplicates(
             (``model.has_embeddings = True``)
         model_kwargs (None): a dictionary of optional keyword arguments to pass
             to the model's ``Config`` when a model name is provided
+        hash_method (None): the perceptual hashing method to use in place of
+            embeddings. The supported values are ``["dhash", "phash", "ahash"]``
         force_square (False): whether to minimally manipulate the patch
             bounding boxes into squares prior to extraction. Only applicable
             when a ``model`` and ``roi_field`` are specified
@@ -779,6 +786,7 @@ def compute_near_duplicates(
         similarity_index=similarity_index,
         model=model,
         model_kwargs=model_kwargs,
+        hash_method=hash_method,
         force_square=force_square,
         alpha=alpha,
         batch_size=batch_size,
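Taken together, these changes expose perceptual hashing through the existing public entry points. A minimal usage sketch based on the signatures in this diff (the dataset name is hypothetical, and the new argument is still draft API):

import fiftyone as fo
import fiftyone.brain as fob

dataset = fo.load_dataset("my-dataset")  # hypothetical dataset name

# Find near-duplicates using a perceptual hash instead of model embeddings
index = fob.compute_near_duplicates(dataset, hash_method="phash")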
@@ -19,11 +19,16 @@
 import fiftyone.brain as fb
 import fiftyone.brain.similarity as fbs
 import fiftyone.brain.internal.core.utils as fbu
+import fiftyone.brain.internal.core.perceptual_hash as fbh
+
+from sklearn.metrics.pairwise import pairwise_distances
+import numpy as np

 logger = logging.getLogger(__name__)

 _DEFAULT_MODEL = "resnet18-imagenet-torch"
+FILE_HASH_TYPES = ["md5", "sha1", "sha256", "sha512"]


 def compute_near_duplicates(
@@ -34,6 +39,7 @@ def compute_near_duplicates(
     similarity_index=None,
     model=None,
     model_kwargs=None,
+    hash_method=None,
     force_square=False,
     alpha=None,
     batch_size=None,
@@ -62,6 +68,7 @@ def compute_near_duplicates(
         model is None
         and embeddings is None
         and similarity_index is None
+        and hash_method is None
         and not embeddings_exist
     ):
         model = _DEFAULT_MODEL
@@ -74,6 +81,7 @@ def compute_near_duplicates(
         embeddings=embeddings_field or embeddings,
         model=model,
         model_kwargs=model_kwargs,
+        hash_method=hash_method,
         force_square=force_square,
         alpha=alpha,
         batch_size=batch_size,
@@ -139,6 +147,7 @@ def compute_exact_duplicates(samples, num_workers, skip_failures, progress):

 def _compute_filehashes(samples, method, progress):
     ids, filepaths = samples.values(["id", "filepath"])
+    # I need embeddings, sample_ids, label_ids

     with fou.ProgressBar(total=len(ids), progress=progress) as pb:
         return {
@@ -166,7 +175,10 @@ def _compute_filehashes_multi(samples, method, num_workers, progress):

 def _compute_filehash(filepath, method):
     try:
-        filehash = fou.compute_filehash(filepath, method=method)
+        if method is None or method in FILE_HASH_TYPES:
+            filehash = fou.compute_filehash(filepath, method=method)
+        else:
+            filehash = fbh.compute_image_hash(filepath, method=method)
     except:
         filehash = None

Reviewer (on lines +178 to +181): I find it a bit confusing to add image hash computation under the …
@@ -176,7 +188,10 @@ def _compute_filehash(filepath, method):

 def _do_compute_filehash(args):
     _id, filepath, method = args
     try:
-        filehash = fou.compute_filehash(filepath, method=method)
+        if method is None or method in FILE_HASH_TYPES:
+            filehash = fou.compute_filehash(filepath, method=method)
+        else:
+            filehash = fbh.compute_image_hash(filepath, method=method)
     except:
         filehash = None

Reviewer (on lines +191 to +194): Similar comment to earlier: this function name is no longer accurate if we are not just calculating file hashes anymore here.
@@ -0,0 +1,122 @@ (new file)
""" | ||
Image hashing methods. | ||
|
||
| Copyright 2017-2024, Voxel51, Inc. | ||
| `voxel51.com <https://voxel51.com/>`_ | ||
| | ||
""" | ||
|
||
import numpy as np | ||
import eta.core.image as etai | ||
import scipy | ||
|
||
|
||
def compute_image_hash(image_path, method="phash", hash_size=8):
    """
    Computes a hash of the input image.

    Args:
        image_path: Input image path.
        method: The hashing method to use. Supported values are
            "ahash", "phash", and "dhash".
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    image = etai.read(image_path)
    if method == "ahash":
        return ahash(image, hash_size=hash_size)
    elif method == "phash":
        return phash(image, hash_size=hash_size)
    elif method == "dhash":
        return dhash(image, hash_size=hash_size)
    else:
        raise ValueError("Unsupported hashing method '%s'" % method)

Reviewer (on compute_image_hash): Not specific to this, and not necessarily important to change here, but I wanted to note: we use this pattern (consolidating many implementations into one core function) a lot. It's not great, obviously, because you have to manually add each new option to the if-else. We should think about moving to registries over time.

Author: Good point. I typically prefer registries when the set of implementations is unbounded (new model architectures, for example). I believe the set of hash functions for images is relatively bounded, so I wasn't too worried.
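The binary hashes returned above are meant to be compared bitwise; a quick sketch of that comparison using Hamming distance (the image paths are hypothetical):

import numpy as np

h1 = compute_image_hash("/path/to/a.jpg", method="dhash")  # hypothetical path
h2 = compute_image_hash("/path/to/b.jpg", method="dhash")  # hypothetical path

# Hamming distance: the number of differing bits; small values suggest
# the two images are near-duplicates
dist = np.count_nonzero(h1 != h2)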
def ahash(image, hash_size=8):
    """
    Computes the average hash (aHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    # Step 1: Convert to grayscale
    gray = etai.rgb_to_gray(image)

    # Step 2: Resize to hash_size x hash_size
    resized = etai.resize(gray, hash_size, hash_size)

    # Step 3: Compute the mean pixel value
    mean = resized.mean()

    # Step 4: Create the binary hash
    binary_hash = (resized >= mean).astype(np.uint8)

    # Step 5: Flatten the hash to 1D
    flat_hash = binary_hash.flatten()

    return flat_hash
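The mean-threshold step is the core of aHash; a tiny worked example on a synthetic 2x2 "image" showing exactly what Steps 3-5 compute:

import numpy as np

resized = np.array([[10, 200], [30, 180]], dtype=np.uint8)
mean = resized.mean()                                # 105.0
bits = (resized >= mean).astype(np.uint8).flatten()  # [0, 1, 0, 1]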
def phash(image, hash_size=8):
    """
    Computes the perceptual hash (pHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    # Step 1: Convert to grayscale
    gray = etai.rgb_to_gray(image)

    # Step 2: Resize to hash_size x hash_size
    resized = etai.resize(gray, hash_size, hash_size)

    # Step 3: Compute the Discrete Cosine Transform (DCT)
    dct = scipy.fft.dct(resized, norm="ortho")

    # Step 4: Extract the top-left hash_size x hash_size values
    dct = dct[:hash_size, :hash_size]

    # Step 5: Compute the median of the top-left values
    median = np.median(dct)

    # Step 6: Create the binary hash
    binary_hash = (dct >= median).astype(np.uint8)

    # Step 7: Flatten the hash to 1D
    flat_hash = binary_hash.flatten()

    return flat_hash
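One observation for context: since `resized` above is already hash_size x hash_size, the Step 4 slice keeps every coefficient, and `scipy.fft.dct` applies a 1-D DCT along the last axis. Many common pHash implementations (e.g., the imagehash library) instead oversample the image and take a 2-D DCT so the retained block is genuinely low-frequency. A sketch of that variant for comparison, not what this diff implements:

import numpy as np
import scipy.fft
import eta.core.image as etai

def phash_2d(image, hash_size=8, highfreq_factor=4):
    # Oversample so the kept top-left block is genuinely low-frequency
    size = hash_size * highfreq_factor
    gray = etai.rgb_to_gray(image)
    resized = etai.resize(gray, size, size)

    # 2-D DCT: apply the 1-D DCT along each axis in turn
    dct = scipy.fft.dct(
        scipy.fft.dct(resized, axis=0, norm="ortho"), axis=1, norm="ortho"
    )

    # Keep the low-frequency block and threshold at its median
    low = dct[:hash_size, :hash_size]
    return (low >= np.median(low)).astype(np.uint8).flatten()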
def dhash(image, hash_size=8):
    """
    Computes the difference hash (dHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    # Convert the image to grayscale
    gray = etai.rgb_to_gray(image)

    # Resize to (hash_size + 1) x hash_size so adjacent pixels can be compared
    resized = etai.resize(gray, hash_size + 1, hash_size)

    # Compute the differences between horizontally adjacent pixels
    diff = resized[:, 1:] > resized[:, :-1]

    # Flatten the boolean differences into a 1D binary array
    binary_array = diff.flatten().astype(int)

    return binary_array
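Finally, the `pairwise_distances` import added in the duplicates module suggests how these hashes would be compared in bulk. A sketch of batch near-duplicate detection (the file paths and the 0.1 threshold are assumptions, not values from this PR):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

filepaths = ["/path/to/a.jpg", "/path/to/b.jpg", "/path/to/c.jpg"]  # hypothetical

# Stack the per-image binary hashes into an (n_images, n_bits) matrix
hashes = np.stack([compute_image_hash(p, method="dhash") for p in filepaths])

# The Hamming metric returns the fraction of differing bits for each pair
dists = pairwise_distances(hashes, metric="hamming")

# Flag pairs whose hashes differ in fewer than 10% of bits (assumed threshold)
i, j = np.triu_indices(len(filepaths), k=1)
near_dupes = [
    (filepaths[a], filepaths[b]) for a, b in zip(i, j) if dists[a, b] < 0.1
]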