[DRAFT] Perceptual Hashing Deduplication #226
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
@@ -19,6 +19,10 @@
 import fiftyone.brain as fb
 import fiftyone.brain.similarity as fbs
 import fiftyone.brain.internal.core.utils as fbu
+import fiftyone.brain.internal.core.perceptual_hash as fbh
+
+from sklearn.metrics.pairwise import pairwise_distances
+import numpy as np


 logger = logging.getLogger(__name__)
@@ -92,7 +96,14 @@ def compute_near_duplicates(
     return similarity_index


-def compute_exact_duplicates(samples, num_workers, skip_failures, progress):
+def compute_exact_duplicates(
+    samples,
+    num_workers,
+    skip_failures,
+    progress,
+    hash_method="filehash",
+    threshold=None,
+):
     """See ``fiftyone/brain/__init__.py``."""

     fov.validate_collection(samples)
@@ -105,7 +116,10 @@ def compute_exact_duplicates(samples, num_workers, skip_failures, progress):

     logger.info("Computing filehashes...")

-    method = "md5" if samples.media_type == fom.VIDEO else None
+    if hash_method == "filehash":
+        method = "md5" if samples.media_type == fom.VIDEO else None
+    else:
+        method = hash_method

     if num_workers <= 1:
         hashes = _compute_filehashes(samples, method, progress)
@@ -123,16 +137,31 @@ def compute_exact_duplicates(samples, num_workers, skip_failures, progress): | |
raise ValueError(msg) | ||
|
||
neighbors_map = defaultdict(list) | ||
if hash_method == "filehash": | ||
observed_hashes = {} | ||
for _id, _hash in hashes.items(): | ||
if _hash is None: | ||
continue | ||
|
||
if _hash in observed_hashes: | ||
neighbors_map[observed_hashes[_hash]].append(_id) | ||
else: | ||
observed_hashes[_hash] = _id | ||
else: | ||
observed_hashes = {} | ||
|
||
observed_hashes = {} | ||
for _id, _hash in hashes.items(): | ||
if _hash is None: | ||
continue | ||
_d = hashes.items() | ||
_ids = [item[0] for item in _d] | ||
_hashes = [item[1] for item in _d] | ||
|
||
if _hash in observed_hashes: | ||
neighbors_map[observed_hashes[_hash]].append(_id) | ||
else: | ||
observed_hashes[_hash] = _id | ||
distances = pairwise_distances(_hashes, _hashes, metric="hamming") | ||
|
||
mask = np.eye(distances.shape[0], dtype=bool) | ||
thresholded_distances = np.logical_and(distances < threshold, ~mask) | ||
mwoodson1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for i, _id in enumerate(_ids): | ||
nearby_indices = np.where(thresholded_distances[i, :])[0] | ||
duplicate_ids = [_ids[j] for j in nearby_indices] | ||
neighbors_map[_id] = duplicate_ids | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is different from the similarity index neighbors_map that gives keys of "unique" samples and values of duplicates of that sample. Not an issue but can be misleading. Keep in mind this recounts neighbors both ways. May be worthwhile to only go over the upper triangle of the |
||
|
||
return dict(neighbors_map) | ||
|
||
|
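The reviewer's upper-triangle suggestion could look roughly like the following sketch (a hypothetical helper, not part of the PR). One detail worth flagging: sklearn's "hamming" metric returns the fraction of differing entries, so `threshold` here is a proportion in [0, 1], not a raw bit count.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def hamming_neighbors_upper(ids, hashes, threshold):
    """Hypothetical variant that scans only the upper triangle so each
    pair is counted once, with the first-seen sample acting as the
    "unique" key, mirroring the similarity-index convention."""
    # assumes None (failed) hashes were filtered out beforehand
    X = np.stack(hashes)
    distances = pairwise_distances(X, metric="hamming")

    neighbors_map = {}
    seen = set()
    n = len(ids)
    for i in range(n):
        if ids[i] in seen:
            continue  # already assigned as some sample's duplicate

        dupes = [
            ids[j]
            for j in range(i + 1, n)
            if distances[i, j] < threshold and ids[j] not in seen
        ]
        if dupes:
            neighbors_map[ids[i]] = dupes
            seen.update(dupes)

    return neighbors_map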
@@ -166,7 +195,10 @@ def _compute_filehashes_multi(samples, method, num_workers, progress):

 def _compute_filehash(filepath, method):
     try:
-        filehash = fou.compute_filehash(filepath, method=method)
+        if method is None or method in ["md5", "sha1", "sha256", "sha512"]:
+            filehash = fou.compute_filehash(filepath, method=method)
+        else:
+            filehash = fbh.compute_image_hash(filepath, method=method)
     except:
         filehash = None
@@ -176,7 +208,10 @@ def _compute_filehash(filepath, method):
 def _do_compute_filehash(args):
     _id, filepath, method = args
     try:
-        filehash = fou.compute_filehash(filepath, method=method)
+        if method is None or method in ["md5", "sha1", "sha256", "sha512"]:
+            filehash = fou.compute_filehash(filepath, method=method)
+        else:
+            filehash = fbh.compute_image_hash(filepath, method=method)
     except:
         filehash = None
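Since the same method dispatch now appears in both _compute_filehash and _do_compute_filehash, a shared helper could remove the duplication; a sketch (hypothetical, not part of the PR), which also narrows the bare except::

import fiftyone.core.utils as fou
import fiftyone.brain.internal.core.perceptual_hash as fbh

# Hypothetical consolidation: both workers could share one dispatch helper
_FILE_HASH_METHODS = {None, "md5", "sha1", "sha256", "sha512"}

def _compute_hash_safe(filepath, method):
    try:
        if method in _FILE_HASH_METHODS:
            return fou.compute_filehash(filepath, method=method)

        return fbh.compute_image_hash(filepath, method=method)
    except Exception:
        # mirrors the PR's skip-failures behavior, but avoids a bare
        # `except:`, which would also swallow KeyboardInterrupt/SystemExit
        return None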
fiftyone/brain/internal/core/perceptual_hash.py (new file)
@@ -0,0 +1,173 @@
""" | ||
Image hashing methods. | ||
|
||
| Copyright 2017-2024, Voxel51, Inc. | ||
| `voxel51.com <https://voxel51.com/>`_ | ||
| | ||
""" | ||
|
||
import numpy as np | ||
import eta.core.image as etai | ||
import scipy | ||
|
||
|
||
def compute_image_hash(image_path, method="phash", hash_size=8):
    """
    Computes a hash of the input image.

    Args:
        image_path: Input image path.
        method: The hashing method to use. Supported values are
            "ahash", "phash", "dhash", and "whash".
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    image = etai.read(image_path)
    if method == "ahash":
        return ahash(image, hash_size=hash_size)
    elif method == "phash":
        return phash(image, hash_size=hash_size)
    elif method == "dhash":
        return dhash(image, hash_size=hash_size)
    elif method == "whash":
        return whash(image, hash_size=hash_size)
    else:
        raise ValueError("Unsupported hashing method '%s'" % method)

Review comment: Not specific to this PR, and not necessarily important to change here, but wanted to note: we use this pattern (consolidating many implementations into one core function) a lot. It's not great, obviously, because you have to manually add each new option to the if-else. We should think about moving to registries over time.

Reply: Good point. I typically prefer registries when the set of implementations is unbounded (new model architectures, for example). I believe the set of hash functions for images is relatively bounded, so I wasn't too worried.
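As an aside on the registry discussion above, the dispatcher could be replaced by a small decorator-based registry; a sketch with hypothetical names, not part of this PR:

import eta.core.image as etai

# Hypothetical registry keyed by method name; new hash functions
# register themselves instead of extending an if-else chain
_HASH_METHODS = {}

def register_hash_method(name):
    def decorator(func):
        _HASH_METHODS[name] = func
        return func
    return decorator

@register_hash_method("ahash")
def _registered_ahash(image, hash_size=8):
    return ahash(image, hash_size=hash_size)  # delegates to the PR's ahash

def compute_image_hash_v2(image_path, method="phash", hash_size=8):
    if method not in _HASH_METHODS:
        raise ValueError("Unsupported hashing method '%s'" % method)

    return _HASH_METHODS[method](etai.read(image_path), hash_size=hash_size)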
def ahash(image, hash_size=8):
    """
    Computes the average hash (aHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    # Step 1: Convert to grayscale
    gray = etai.rgb_to_gray(image)

    # Step 2: Resize to hash_size x hash_size
    resized = etai.resize(gray, hash_size, hash_size)

    # Step 3: Compute the mean pixel value
    mean = resized.mean()

    # Step 4: Create the binary hash
    binary_hash = (resized >= mean).astype(np.uint8)

    # Step 5: Flatten the hash to 1D
    flat_hash = binary_hash.flatten()

    return flat_hash
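To make the mean-threshold step concrete, here is a toy walkthrough of steps 3 through 5 on a fabricated 4x4 grid (illustrative values only):

import numpy as np

# Fake 4x4 "image" of alternating dark and bright pixels
resized = np.array([
    [10, 200, 10, 200],
    [10, 200, 10, 200],
    [10, 200, 10, 200],
    [10, 200, 10, 200],
])
mean = resized.mean()                      # 105.0
bits = (resized >= mean).astype(np.uint8)  # 1 where brighter than the mean
print(bits.flatten())                      # [0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]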
def phash(image, hash_size=8):
    """
    Computes the perceptual hash (pHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    # Step 1: Convert to grayscale
    gray = etai.rgb_to_gray(image)

    # Step 2: Resize to hash_size x hash_size
    resized = etai.resize(gray, hash_size, hash_size)

    # Step 3: Compute the Discrete Cosine Transform (DCT)
    dct = scipy.fft.dct(resized, norm="ortho")

    # Step 4: Extract the top-left hash_size x hash_size values
    dct = dct[:hash_size, :hash_size]

    # Step 5: Compute the median of the top-left values
    median = np.median(dct)

    # Step 6: Create the binary hash
    binary_hash = (dct >= median).astype(np.uint8)

    # Step 7: Flatten the hash to 1D
    flat_hash = binary_hash.flatten()

    return flat_hash
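One subtlety worth flagging: scipy.fft.dct applied to a 2D array transforms along the last axis only, whereas classic pHash uses a 2D DCT; and because the image is already resized to hash_size x hash_size, the [:hash_size, :hash_size] slice keeps the full transform (common implementations resize to a larger grid, e.g. 4 * hash_size, before slicing). If a 2D transform is intended, a sketch might look like this (an assumption about intent, not what the PR does):

import numpy as np
import scipy.fft

def phash_2d(gray_resized, hash_size=8):
    # scipy.fft.dctn applies the DCT along both axes
    dct = scipy.fft.dctn(gray_resized, norm="ortho")
    dct = dct[:hash_size, :hash_size]  # keep the low-frequency block
    median = np.median(dct)
    return (dct >= median).astype(np.uint8).flatten()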
def dhash(image, hash_size=8):
    """
    Computes the difference hash (dHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    # Convert the image to grayscale
    gray = etai.rgb_to_gray(image)

    # Resize the image to (hash_size + 1, hash_size)
    resized = etai.resize(gray, hash_size + 1, hash_size)

    # Compute the differences between adjacent pixels
    diff = resized[:, 1:] > resized[:, :-1]

    # Convert the difference image to a binary array
    binary_array = diff.flatten().astype(int)

    return binary_array
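A toy walkthrough of the adjacent-pixel comparison on a fabricated 3x4 grid (illustrative values only):

import numpy as np

# Fake 3x4 grid of pixel intensities
resized = np.array([
    [10, 20, 15, 30],
    [50, 40, 45, 60],
    [ 5,  5,  5,  5],
])
diff = resized[:, 1:] > resized[:, :-1]  # True where brightness increases
print(diff.astype(int).flatten())        # [1 0 1 0 1 1 0 0 0]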
def whash(image, hash_size=8):
    """
    Computes the wavelet hash (wHash) of an image.

    Args:
        image: Input image as a NumPy array.
        hash_size: Size of the hash (default is 8x8).

    Returns:
        A 1D NumPy array representing the hash.
    """
    import pywt

    # Step 1: Convert to grayscale
    gray = etai.rgb_to_gray(image)

    # Step 2: Resize to hash_size x hash_size
    resized = etai.resize(gray, hash_size, hash_size)

    # Step 3: Compute the wavelet transform
    coeffs = pywt.dwt2(resized, "haar")
    cA, (cH, cV, cD) = coeffs

    # Step 4: Extract the approximation coefficients
    cA = cA.flatten()

    # Step 5: Compute the mean of the approximation coefficients
    mean = cA.mean()

    # Step 6: Create the binary hash
    binary_hash = (cA >= mean).astype(np.uint8)

    return binary_hash
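One size subtlety worth noting: a single-level Haar DWT halves each dimension, so with hash_size=8 the wHash has 16 bits rather than the 64 produced by the other methods. A quick check:

import numpy as np
import pywt

resized = np.arange(64, dtype=float).reshape(8, 8)
cA, (cH, cV, cD) = pywt.dwt2(resized, "haar")
print(cA.shape)  # (4, 4) -> a 16-bit hash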
def hamming_distance(hash1, hash2):
    """
    Computes the Hamming distance between two hashes.

    Args:
        hash1: First hash as a 1D NumPy array.
        hash2: Second hash as a 1D NumPy array.

    Returns:
        The Hamming distance (integer).
    """
    return np.count_nonzero(hash1 != hash2)
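A minimal end-to-end usage sketch of this module (the image paths are hypothetical):

import fiftyone.brain.internal.core.perceptual_hash as fbh

# Hash two (hypothetical) images and compare them
h1 = fbh.compute_image_hash("/path/to/image1.jpg", method="phash", hash_size=8)
h2 = fbh.compute_image_hash("/path/to/image2.jpg", method="phash", hash_size=8)

# For 64-bit hashes, a common rule of thumb treats distances of roughly
# 10 or fewer as near-duplicates
print(fbh.hamming_distance(h1, h2))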
Review comment: Maybe have this in compute_near_duplicates? You can compute hashes into a field and pass that as the embedding field; it should give the same behavior. This serves the following: it reuses the sklearn.metrics.pairwise.pairwise_distances backend, just like the similarity index w/ duplicates mixin in compute_near_duplicates. What do you think?
Reply: I went back and forth on whether this should be in compute_exact_duplicates vs compute_near_duplicates for a while. I'd be curious to get @prernadh's opinion since this was a customer request, but in my opinion hash-based deduplication is closer to exact deduplication than it is to near deduplication.
Reply: Hmm, I would think this is closer to near duplicates rather than exact, as we are able to use a threshold to find close samples.
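For reference, the first reviewer's alternative could look roughly like the following sketch (an assumption about how the pieces fit together, not a confirmed workflow; the dataset name and field name are hypothetical):

import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.brain.internal.core.perceptual_hash as fbh

dataset = fo.load_dataset("my-dataset")  # hypothetical dataset name

# Store per-sample hash vectors in a field...
for sample in dataset.iter_samples(autosave=True):
    sample["phash"] = fbh.compute_image_hash(sample.filepath, method="phash")

# ...then let the near-duplicates pipeline handle the pairwise distances
index = fob.compute_near_duplicates(
    dataset,
    embeddings="phash",
    threshold=0.1,  # assumption: interpreted against the chosen metric
)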