[DRAFT] Perceptual Hashing Deduplication #226


Open · wants to merge 6 commits into base: develop

Changes from 1 commit
8 changes: 7 additions & 1 deletion fiftyone/brain/__init__.py
@@ -793,6 +793,8 @@ def compute_exact_duplicates(
num_workers=None,
skip_failures=True,
progress=None,
hash_method="filehash",
threshold=None,
):
"""Detects duplicate media in a sample collection.

@@ -811,6 +813,10 @@ def compute_exact_duplicates(
progress (None): whether to render a progress bar (True/False), use the
default value ``fiftyone.config.show_progress_bars`` (None), or a
progress callback function to invoke instead
hash_method ('filehash'): the hashing method to use. Supported values are
'filehash', 'phash', 'dhash', 'ahash', and 'whash'
threshold (None): the threshold to use when comparing hashes. Only used
if ``hash_method`` is 'phash', 'dhash', 'ahash', or 'whash'

Returns:
a dictionary mapping IDs of samples with exact duplicates to lists of
@@ -819,7 +825,7 @@
import fiftyone.brain.internal.core.duplicates as fbd

return fbd.compute_exact_duplicates(
-        samples, num_workers, skip_failures, progress
+        samples, num_workers, skip_failures, progress, hash_method, threshold
)
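For context, a minimal usage sketch of the new parameters (the dataset name and threshold value are hypothetical illustrations, not part of this PR):

import fiftyone as fo
import fiftyone.brain as fob

dataset = fo.load_dataset("my-dataset")  # hypothetical dataset name

# Existing behavior: exact duplicates via file hashes
filehash_dups = fob.compute_exact_duplicates(dataset)

# New behavior: perceptual-hash duplicates with a distance threshold
phash_dups = fob.compute_exact_duplicates(
    dataset, hash_method="phash", threshold=0.1
)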


59 changes: 47 additions & 12 deletions fiftyone/brain/internal/core/duplicates.py
@@ -19,6 +19,10 @@
import fiftyone.brain as fb
import fiftyone.brain.similarity as fbs
import fiftyone.brain.internal.core.utils as fbu
import fiftyone.brain.internal.core.perceptual_hash as fbh

from sklearn.metrics.pairwise import pairwise_distances
import numpy as np


logger = logging.getLogger(__name__)
@@ -92,7 +96,14 @@ def compute_near_duplicates(
return similarity_index


-def compute_exact_duplicates(samples, num_workers, skip_failures, progress):
+def compute_exact_duplicates(
@jacobsela (Contributor) commented on Jan 9, 2025:
Maybe have this in compute_near_duplicates? You can compute hashes into a field and pass that as the embedding field. It should give the same behavior. This serves the following:

  1. Clarity - this still isn't exact duplicates.
  2. This requires a threshold parameter, which is more in line with near duplicates than exact.
  3. This code uses the sklearn.metrics.pairwise.pairwise_distances backend, just like the similarity index w/ duplicates mixin in compute_near_duplicates.
  4. Near duplicates returns the underlying similarity index, allowing for easier manipulation of the data afterwards compared with the neighbors map.
  5. Having one point of contact with metric methods is useful long term when we want to work on scaling this code.

What do you think?

Author replied:
I went back and forth on whether this should be in compute_exact_duplicates vs compute_near_duplicates for a while. I'd be curious to get @prernadh's opinion since this was a customer request, but in my opinion hash-based deduplication is closer to exact deduplication than it is to near deduplication.

Contributor replied:
Mmm, I would think this is closer to near duplicates rather than exact, as we are able to use a threshold to find close samples.
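For illustration, a minimal sketch of the compute_near_duplicates routing suggested above (the field name "phash", the exact embeddings/threshold arguments, and the duplicate_ids accessor are assumptions, not part of this PR):

import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.brain.internal.core.perceptual_hash as fbh

dataset = fo.load_dataset("my-dataset")  # hypothetical dataset name

# Compute perceptual hashes into a field so they can serve as embeddings
for sample in dataset.iter_samples(autosave=True, progress=True):
    sample["phash"] = fbh.compute_image_hash(sample.filepath, method="phash")

# Reuse the near-duplicates machinery, treating the hash field as embeddings
index = fob.compute_near_duplicates(
    dataset,
    embeddings="phash",  # assumed: pass the precomputed hash field by name
    threshold=0.1,       # arbitrary example threshold
)
print(index.duplicate_ids)  # assumed: duplicates exposed by the similarity index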

+    samples,
+    num_workers,
+    skip_failures,
+    progress,
+    hash_method="filehash",
+    threshold=None,
+):
"""See ``fiftyone/brain/__init__.py``."""

fov.validate_collection(samples)
@@ -105,7 +116,10 @@ def compute_exact_duplicates(

logger.info("Computing filehashes...")

method = "md5" if samples.media_type == fom.VIDEO else None
if hash_method == "filehash":
method = "md5" if samples.media_type == fom.VIDEO else None
else:
method = hash_method

if num_workers <= 1:
hashes = _compute_filehashes(samples, method, progress)
@@ -123,16 +137,31 @@ def compute_exact_duplicates(
raise ValueError(msg)

     neighbors_map = defaultdict(list)
+    if hash_method == "filehash":
+        observed_hashes = {}
+        for _id, _hash in hashes.items():
+            if _hash is None:
+                continue
+
+            if _hash in observed_hashes:
+                neighbors_map[observed_hashes[_hash]].append(_id)
+            else:
+                observed_hashes[_hash] = _id
+    else:
+        observed_hashes = {}
-    observed_hashes = {}
-    for _id, _hash in hashes.items():
-        if _hash is None:
-            continue
+        _d = hashes.items()
+        _ids = [item[0] for item in _d]
+        _hashes = [item[1] for item in _d]

-        if _hash in observed_hashes:
-            neighbors_map[observed_hashes[_hash]].append(_id)
-        else:
-            observed_hashes[_hash] = _id
+        distances = pairwise_distances(_hashes, _hashes, metric="hamming")

+        mask = np.eye(distances.shape[0], dtype=bool)
+        thresholded_distances = np.logical_and(distances < threshold, ~mask)
+        for i, _id in enumerate(_ids):
+            nearby_indices = np.where(thresholded_distances[i, :])[0]
+            duplicate_ids = [_ids[j] for j in nearby_indices]
+            neighbors_map[_id] = duplicate_ids
Contributor commented:
This is different from the similarity index neighbors_map that gives keys of "unique" samples and values of duplicates of that sample. Not an issue but can be misleading.

Keep in mind this recounts neighbors both ways. May be worthwhile to only go over the upper triangle of the thresholded_distances matrix, i.e. nearby_indices = np.where(thresholded_distances[i, i:])[0] + i.
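For illustration, a tiny self-contained sketch of the upper-triangle suggestion (toy data, not part of this PR):

import numpy as np

# Toy thresholded distance matrix for three samples; diagonal entries are
# False here, matching the np.eye mask applied in the PR
_ids = ["a", "b", "c"]
thresholded_distances = np.array(
    [[False, True, False],
     [True, False, False],
     [False, False, False]]
)

neighbors_map = {}
for i, _id in enumerate(_ids):
    # only scan columns j >= i so each pair is recorded once
    nearby_indices = np.where(thresholded_distances[i, i:])[0] + i
    duplicate_ids = [_ids[j] for j in nearby_indices]
    if duplicate_ids:
        neighbors_map[_id] = duplicate_ids

print(neighbors_map)  # {'a': ['b']} -- 'b' no longer re-lists 'a'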


return dict(neighbors_map)

@@ -166,7 +195,10 @@ def _compute_filehashes_multi(samples, method, num_workers, progress):

def _compute_filehash(filepath, method):
try:
-        filehash = fou.compute_filehash(filepath, method=method)
+        if method is None or method in ["md5", "sha1", "sha256", "sha512"]:
+            filehash = fou.compute_filehash(filepath, method=method)
+        else:
+            filehash = fbh.compute_image_hash(filepath, method=method)
except:
filehash = None

@@ -176,7 +208,10 @@ def _compute_filehash(filepath, method):
def _do_compute_filehash(args):
_id, filepath, method = args
try:
-        filehash = fou.compute_filehash(filepath, method=method)
+        if method is None or method in ["md5", "sha1", "sha256", "sha512"]:
+            filehash = fou.compute_filehash(filepath, method=method)
+        else:
+            filehash = fbh.compute_image_hash(filepath, method=method)
except:
filehash = None

173 changes: 173 additions & 0 deletions fiftyone/brain/internal/core/perceptual_hash.py
@@ -0,0 +1,173 @@
"""
Image hashing methods.

| Copyright 2017-2024, Voxel51, Inc.
| `voxel51.com <https://voxel51.com/>`_
|
"""

import numpy as np
import eta.core.image as etai
import scipy


def compute_image_hash(image_path, method="phash", hash_size=8):
Contributor commented:
Not specific to this, not necessarily important to change here but wanted to note: we use this pattern (consolidate many implementations into 1 core function) a lot. It's not great obviously because you have to manually add each new option to the if-else. We should think about moving to registries over time.

Author replied:
Good point. I typically prefer registries when the set of implementations is unbounded (new model architectures, for example). I believe the set of hash functions for images is relatively bounded, so I wasn't too worried.
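For illustration, a minimal sketch of what a registry-based dispatch might look like (hypothetical names, not part of this PR):

# Hypothetical registry: hash functions register themselves by name, so the
# dispatcher does not need a growing if-else chain
_HASH_REGISTRY = {}


def register_hash(name):
    def decorator(func):
        _HASH_REGISTRY[name] = func
        return func

    return decorator


@register_hash("ahash")
def _ahash(image, hash_size=8):
    ...  # average-hash implementation would go here


def compute_image_hash_via_registry(image, method="phash", hash_size=8):
    if method not in _HASH_REGISTRY:
        raise ValueError("Unsupported hashing method '%s'" % method)
    return _HASH_REGISTRY[method](image, hash_size=hash_size)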

"""
Computes a hash of the input image.

Args:
image_path: Input image path.
method: The hashing method to use. Supported values are
"ahash", "phash", "dhash", and "whash".
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
image = etai.read(image_path)
if method == "ahash":
return ahash(image, hash_size=hash_size)
elif method == "phash":
return phash(image, hash_size=hash_size)
elif method == "dhash":
return dhash(image, hash_size=hash_size)
elif method == "whash":
return whash(image, hash_size=hash_size)
else:
raise ValueError("Unsupported hashing method '%s'" % method)


def ahash(image, hash_size=8):
"""
Computes the average hash (aHash) of an image.

Args:
image: Input image as a NumPy array.
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
# Step 1: Convert to grayscale
gray = etai.rgb_to_gray(image)

# Step 2: Resize to hash_size x hash_size
resized = etai.resize(gray, hash_size, hash_size)

# Step 3: Compute the mean pixel value
mean = resized.mean()

# Step 4: Create the binary hash
binary_hash = (resized >= mean).astype(np.uint8)

# Step 5: Flatten the hash to 1D
flat_hash = binary_hash.flatten()

return flat_hash


def phash(image, hash_size=8):
"""
Computes the perceptual hash (pHash) of an image.

Args:
image: Input image as a NumPy array.
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
# Step 1: Convert to grayscale
gray = etai.rgb_to_gray(image)

# Step 2: Resize to hash_size x hash_size
resized = etai.resize(gray, hash_size, hash_size)

# Step 3: Compute the Discrete Cosine Transform (DCT)
dct = scipy.fft.dct(resized, norm="ortho")

# Step 4: Extract the top-left hash_size x hash_size values
dct = dct[:hash_size, :hash_size]

# Step 5: Compute the median of the top-left values
median = np.median(dct)

# Step 6: Create the binary hash
binary_hash = (dct >= median).astype(np.uint8)

# Step 7: Flatten the hash to 1D
flat_hash = binary_hash.flatten()

return flat_hash


def dhash(image, hash_size=8):
"""
Compute the dHash for the input image.

:param image: Input image to hash (as a NumPy array).
:param hash_size: Size of the hash (default 8x8).
:return: The dHash value of the image as a 64-bit integer.
"""
# Convert the image to grayscale
gray = etai.rgb_to_gray(image)

# Resize the image to (hash_size + 1, hash_size)
resized = etai.resize(gray, hash_size + 1, hash_size)

# Compute the differences between adjacent pixels
diff = resized[:, 1:] > resized[:, :-1]

# Convert the difference image to a binary array
binary_array = diff.flatten().astype(int)

return binary_array


def whash(image, hash_size=8):
"""
Computes the wavelet hash (wHash) of an image.

Args:
image: Input image as a NumPy array.
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
import pywt

# Step 1: Convert to grayscale
gray = etai.rgb_to_gray(image)

# Step 2: Resize to hash_size x hash_size
resized = etai.resize(gray, hash_size, hash_size)

# Step 3: Compute the wavelet transform
coeffs = pywt.dwt2(resized, "haar")
cA, (cH, cV, cD) = coeffs

# Step 4: Extract the approximation coefficients
cA = cA.flatten()

# Step 5: Compute the mean of the approximation coefficients
mean = cA.mean()

# Step 6: Create the binary hash
binary_hash = (cA >= mean).astype(np.uint8)

return binary_hash


def hamming_distance(hash1, hash2):
"""
Computes the Hamming distance between two hashes.

Args:
hash1: First hash as a 1D NumPy array.
hash2: Second hash as a 1D NumPy array.

Returns:
The Hamming distance (integer).
"""
return np.count_nonzero(hash1 != hash2)
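For illustration, a quick usage sketch (the image paths are hypothetical):

import fiftyone.brain.internal.core.perceptual_hash as fbh

hash1 = fbh.compute_image_hash("/path/to/image1.jpg", method="phash", hash_size=8)
hash2 = fbh.compute_image_hash("/path/to/image2.jpg", method="phash", hash_size=8)

# Raw Hamming distance: number of differing bits (0-64 for an 8x8 hash)
dist = fbh.hamming_distance(hash1, hash2)

# Note that sklearn's pairwise_distances(metric="hamming"), used in
# compute_exact_duplicates above, returns the *fraction* of differing bits,
# so the comparable normalized value is:
normalized = dist / hash1.size
print(dist, normalized)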