
[DRAFT] Perceptual Hashing Deduplication #226


Open · wants to merge 6 commits into base: develop
10 changes: 9 additions & 1 deletion fiftyone/brain/__init__.py
@@ -4,7 +4,7 @@

See https://github.com/voxel51/fiftyone for more information.

| Copyright 2017-2024, Voxel51, Inc.
| Copyright 2017-2025, Voxel51, Inc.
| `voxel51.com <https://voxel51.com/>`_
|
"""
@@ -544,6 +544,7 @@ def compute_similarity(
brain_key=None,
model=None,
model_kwargs=None,
hash_method=None,
Contributor:
What do you think about removing these arguments from similarity and having them just in compute_near_duplicates? Do you foresee users wanting to use other similarity functionality besides duplicates on the hashes?

compute_near_duplicates can build the appropriate similarity index on the fly based on the arguments given.

Contributor:
I now see that this is effectively what you are doing in compute_similarity. The code looks good, but I think compute_near_duplicates is a better home for it. Every compute_near_duplicates is a compute_similarity, but not the other way around, so code that handles the special cases that arise for compute_near_duplicates should be there.

Author:
What do you mean by this? The functionality is primarily exposed through compute_near_duplicates, but in order to reuse the distance-computation code, some of the hashing code needs to live in compute_similarity.

Contributor:
Both the functionality and the implementation should be in compute_near_duplicates, with the current implementation that you have, i.e. within compute_near_duplicates:

  1. verify arguments
  2. compute hashes
  3. call compute_similarity as you would usually, but with modified arguments
similarity_index = fb.compute_similarity(
    samples,
    backend="sklearn",
    roi_field=roi_field,
    embeddings=hashes,  # even better, do `embeddings = hashes` earlier
    model=model,  # should be None because of argument checking
    model_kwargs=model_kwargs,  # should be None because of argument checking
    force_square=force_square,
    alpha=alpha,
    batch_size=batch_size,
    num_workers=num_workers,
    skip_failures=skip_failures,
    progress=progress,
    metric='cosine' if hash_method is None else 'manhattan',  # sklearn similarity already accepts this argument
)
  4. Run duplicates on the index and return it to the user, as currently happens

Author:
I'd argue it makes more sense to keep the current layout for the following reasons.

  1. It matches exactly how compute_near_duplicates already works, with compute_similarity doing both the embedding computation and the index creation.
  2. If we break the above pattern, then a user who wants an index on hashes instead of embeddings has to either call compute_near_duplicates first or dig into the code to figure out how to compute hashes and then call compute_similarity elsewhere. That seems like too much work from a UX perspective.

Contributor:
I think this discussion is getting at how we want to manage the brain from a design philosophy standpoint. This is probably something we should get together and discuss properly, because I’m sure it will come up a lot more in the future. What do you think?

Regarding the current point at hand:

First I’d like to say that my main concern is this code not being in similarity, rather than it being in compute_near_duplicates. If you’d prefer to have a new function like compute_perceptual_hash_duplicates or something along those lines, that’s also a good solution.

The case for moving to compute_near_duplicates:

  1. Most importantly, similarity needs to stay clean. Similarity is the contact point of all other methods to metric methods in FO. Deduplication with hashes is a niche task, so its code shouldn’t go into the well-encapsulated, very widely used code of similarity.
  2. Since compute_near_duplicates (and by extension, hash deduplication) is effectively a macro for a specific use case of similarity, it shouldn’t change the core similarity code. From hash deduplication’s standpoint, compute_similarity can be a black box and it will still work fine.
  3. compute_similarity already has all of the functionality and arguments needed for hash deduplication. Why add more code when it’s not needed?

To address your points:

  1. The core functionality of compute_similarity is to create a similarity index for the user. If you could get hash computation to fit into the existing embedding computation code (by abstracting the hash as a “model” of sorts), then hash deduplication would transparently fit into compute_similarity. Since the implementation of the “embedding computation” is so different, it belongs elsewhere. On top of all of this, the embedding computation is a convenience (a good one, because computing embeddings is very widely needed), but arguably not the core functionality. This is why compute_similarity allows the user to pass embeddings as a numpy array or by field (in fact, it is better to precompute embeddings and pass them to similarity from a performance standpoint, because similarity can’t make strong assumptions about the user’s setup and as such is not super optimal).
  2. This is why I asked above whether you foresee users needing a similarity index with hashes. What is the use case that this serves? If this is in fact a very niche use case, it’s not worth changing very widely used, core code for it.

force_square=False,
alpha=None,
batch_size=None,
@@ -631,6 +632,8 @@ def compute_similarity(
must expose embeddings (``model.has_embeddings = True``)
model_kwargs (None): a dictionary of optional keyword arguments to pass
to the model's ``Config`` when a model name is provided
hash_method (None): the perceptual hashing method to use in place of
embeddings. The supported values are ``["dhash", "phash", "ahash"]``
force_square (False): whether to minimally manipulate the patch
bounding boxes into squares prior to extraction. Only applicable
when a ``model`` and ``patches_field``/``roi_field`` are specified
@@ -672,6 +675,7 @@ def compute_similarity(
brain_key,
model,
model_kwargs,
hash_method,
force_square,
alpha,
batch_size,
@@ -691,6 +695,7 @@ def compute_near_duplicates(
similarity_index=None,
model=None,
model_kwargs=None,
hash_method=None,
force_square=False,
alpha=None,
batch_size=None,
@@ -745,6 +750,8 @@ def compute_near_duplicates(
(``model.has_embeddings = True``)
model_kwargs (None): a dictionary of optional keyword arguments to pass
to the model's ``Config`` when a model name is provided
hash_method (None): the perceptual hashing method to use in place of
embeddings. The supported values are ``["dhash", "phash", "ahash"]``
force_square (False): whether to minimally manipulate the patch
bounding boxes into squares prior to extraction. Only applicable
when a ``model`` and ``roi_field`` are specified
@@ -779,6 +786,7 @@ def compute_near_duplicates(
similarity_index=similarity_index,
model=model,
model_kwargs=model_kwargs,
hash_method=hash_method,
force_square=force_square,
alpha=alpha,
batch_size=batch_size,
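For context, a minimal usage sketch of the new hash_method argument, assuming the standard near-duplicates workflow from the FiftyOne Brain docs (the quickstart dataset and the printed property are illustrative only):

    import fiftyone.brain as fob
    import fiftyone.zoo as foz

    dataset = foz.load_zoo_dataset("quickstart")

    # Build a near-duplicate index from perceptual hashes instead of deep embeddings
    index = fob.compute_near_duplicates(dataset, hash_method="phash")

    # Inspect the near-duplicate samples found by the index
    print(index.duplicate_ids)

Internally this routes through compute_similarity with the sklearn backend and a Manhattan metric, as discussed in the thread above.
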
19 changes: 17 additions & 2 deletions fiftyone/brain/internal/core/duplicates.py
@@ -19,11 +19,16 @@
import fiftyone.brain as fb
import fiftyone.brain.similarity as fbs
import fiftyone.brain.internal.core.utils as fbu
import fiftyone.brain.internal.core.perceptual_hash as fbh

from sklearn.metrics.pairwise import pairwise_distances
import numpy as np


logger = logging.getLogger(__name__)

_DEFAULT_MODEL = "resnet18-imagenet-torch"
FILE_HASH_TYPES = ["md5", "sha1", "sha256", "sha512"]


def compute_near_duplicates(
@@ -34,6 +39,7 @@ def compute_near_duplicates(
similarity_index=None,
model=None,
model_kwargs=None,
hash_method=None,
force_square=False,
alpha=None,
batch_size=None,
@@ -62,6 +68,7 @@
model is None
and embeddings is None
and similarity_index is None
and hash_method is None
and not embeddings_exist
):
model = _DEFAULT_MODEL
@@ -74,6 +81,7 @@
embeddings=embeddings_field or embeddings,
model=model,
model_kwargs=model_kwargs,
hash_method=hash_method,
force_square=force_square,
alpha=alpha,
batch_size=batch_size,
@@ -139,6 +147,7 @@ def compute_exact_duplicates(samples, num_workers, skip_failures, progress):

def _compute_filehashes(samples, method, progress):
ids, filepaths = samples.values(["id", "filepath"])
# I need embeddings, sample_ids, label_ids

with fou.ProgressBar(total=len(ids), progress=progress) as pb:
return {
@@ -166,7 +175,10 @@ def _compute_filehashes_multi(samples, method, num_workers, progress):

def _compute_filehash(filepath, method):
try:
filehash = fou.compute_filehash(filepath, method=method)
if method is None or method in FILE_HASH_TYPES:
filehash = fou.compute_filehash(filepath, method=method)
else:
filehash = fbh.compute_image_hash(filepath, method=method)
Comment on lines +178 to +181
Contributor:
I find it a bit confusing to add image hash computation under the _compute_filehash function. I would say it's better to rename this function for clarity.

except:
filehash = None

@@ -176,7 +188,10 @@ def _do_compute_filehash(args):
def _do_compute_filehash(args):
_id, filepath, method = args
try:
filehash = fou.compute_filehash(filepath, method=method)
if method is None or method in FILE_HASH_TYPES:
filehash = fou.compute_filehash(filepath, method=method)
else:
filehash = fbh.compute_image_hash(filepath, method=method)
Comment on lines +191 to +194
Contributor:
Similar comment to earlier - this function name is no longer accurate if we are not just calculating file hashes here anymore.

except:
filehash = None

122 changes: 122 additions & 0 deletions fiftyone/brain/internal/core/perceptual_hash.py
@@ -0,0 +1,122 @@
"""
Image hashing methods.

| Copyright 2017-2024, Voxel51, Inc.
| `voxel51.com <https://voxel51.com/>`_
|
"""

import numpy as np
import eta.core.image as etai
import scipy


def compute_image_hash(image_path, method="phash", hash_size=8):
Contributor:
Not specific to this, not necessarily important to change here but wanted to note: we use this pattern (consolidate many implementations into 1 core function) a lot. It's not great obviously because you have to manually add each new option to the if-else. We should think about moving to registries over time.

Author:
Good point. I typically prefer registries when the set of implementations is unbounded (new model architectures, for example). I believe the set of hash functions for images is relatively bounded, so I wasn't too worried.

"""
Computes a hash of the input image.

Args:
image_path: Input image path.
method: The hashing method to use. Supported values are
"ahash", "phash", and "dhash".
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
image = etai.read(image_path)
if method == "ahash":
return ahash(image, hash_size=hash_size)
elif method == "phash":
return phash(image, hash_size=hash_size)
elif method == "dhash":
return dhash(image, hash_size=hash_size)
else:
raise ValueError("Unsupported hashing method '%s'" % method)


def ahash(image, hash_size=8):
"""
Computes the average hash (aHash) of an image.

Args:
image: Input image as a NumPy array.
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
# Step 1: Convert to grayscale
gray = etai.rgb_to_gray(image)

# Step 2: Resize to hash_size x hash_size
resized = etai.resize(gray, hash_size, hash_size)

# Step 3: Compute the mean pixel value
mean = resized.mean()

# Step 4: Create the binary hash
binary_hash = (resized >= mean).astype(np.uint8)

# Step 5: Flatten the hash to 1D
flat_hash = binary_hash.flatten()

return flat_hash


def phash(image, hash_size=8):
"""
Computes the perceptual hash (pHash) of an image.

Args:
image: Input image as a NumPy array.
hash_size: Size of the hash (default is 8x8).

Returns:
A 1D NumPy array representing the hash.
"""
# Step 1: Convert to grayscale
gray = etai.rgb_to_gray(image)

# Step 2: Resize to hash_size x hash_size
resized = etai.resize(gray, hash_size, hash_size)

# Step 3: Compute the Discrete Cosine Transform (DCT)
dct = scipy.fft.dct(resized, norm="ortho")

# Step 4: Extract the top-left hash_size x hash_size values
dct = dct[:hash_size, :hash_size]

# Step 5: Compute the median of the top-left values
median = np.median(dct)

# Step 6: Create the binary hash
binary_hash = (dct >= median).astype(np.uint8)

# Step 7: Flatten the hash to 1D
flat_hash = binary_hash.flatten()

return flat_hash


def dhash(image, hash_size=8):
"""
Computes the difference hash (dHash) of an image.

Args:
    image: Input image as a NumPy array.
    hash_size: Size of the hash (default is 8x8).

Returns:
    A 1D NumPy array representing the hash.
"""
# Convert the image to grayscale
gray = etai.rgb_to_gray(image)

# Resize the image to (hash_size + 1, hash_size)
resized = etai.resize(gray, hash_size + 1, hash_size)

# Compute the differences between adjacent pixels
diff = resized[:, 1:] > resized[:, :-1]

# Convert the difference image to a binary array
binary_array = diff.flatten().astype(int)

return binary_array
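
As a quick illustration of how these helpers behave, a hedged sketch comparing two hashes directly (the image paths are placeholders, and the Hamming computation is just for illustration; it is not part of this module):

    import numpy as np

    import fiftyone.brain.internal.core.perceptual_hash as fbh

    # With the default hash_size=8, each hash is a 64-element binary vector
    hash_a = fbh.compute_image_hash("/path/to/image_a.jpg", method="phash")
    hash_b = fbh.compute_image_hash("/path/to/image_b.jpg", method="phash")

    # Number of differing bits (Hamming distance); smaller means more visually similar
    print(int(np.sum(hash_a != hash_b)))

In the PR these vectors are cast to float64 and handed to the sklearn similarity backend.
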
70 changes: 52 additions & 18 deletions fiftyone/brain/similarity.py
@@ -21,10 +21,12 @@
import fiftyone.core.fields as fof
import fiftyone.core.labels as fol
import fiftyone.core.patches as fop
import fiftyone.core.media as fomm
import fiftyone.core.stages as fos
import fiftyone.core.utils as fou
import fiftyone.core.validation as fov
import fiftyone.zoo as foz
import fiftyone.brain.internal.core.duplicates as fbd
from fiftyone import ViewField as F

fbu = fou.lazy_import("fiftyone.brain.internal.core.utils")
@@ -51,6 +53,7 @@ def compute_similarity(
brain_key,
model,
model_kwargs,
hash_method,
force_square,
alpha,
batch_size,
@@ -69,6 +72,17 @@
samples, roi_field, _ALLOWED_ROI_FIELD_TYPES
)

if hash_method is not None and backend != "sklearn":
raise ValueError(
"The `hash_method` parameter is only supported by the 'sklearn' "
"backend"
)

if hash_method is not None and samples.media_type != fomm.IMAGE:
raise ValueError(
"The `hash_method` parameter is only supported for image datasets"
)

# Allow for `embeddings_field=XXX` and `embeddings=False` together
embeddings_field = kwargs.pop("embeddings_field", None)
if embeddings_field is not None or etau.is_str(embeddings):
@@ -86,9 +100,10 @@
embeddings_exist = None

if model is None and embeddings is None and not embeddings_exist:
model = _DEFAULT_MODEL
if batch_size is None:
batch_size = _DEFAULT_BATCH_SIZE
if hash_method is None:
model = _DEFAULT_MODEL
if batch_size is None:
batch_size = _DEFAULT_BATCH_SIZE

if etau.is_str(model):
_model_kwargs = model_kwargs or {}
@@ -101,13 +116,18 @@
_model = model
supports_prompts = None

metric = "cosine"
if hash_method is not None:
metric = "manhattan"

config = _parse_config(
backend,
embeddings_field=embeddings_field,
patches_field=patches_field,
roi_field=roi_field,
model=model,
model_kwargs=model_kwargs,
metric=metric,
supports_prompts=supports_prompts,
**kwargs,
)
@@ -139,21 +159,35 @@
handle_missing = "skip"
agg_fcn = None

embeddings, sample_ids, label_ids = fbu.get_embeddings(
samples,
model=_model,
patches_field=patches_field or roi_field,
embeddings=embeddings,
embeddings_field=embeddings_field,
force_square=force_square,
alpha=alpha,
handle_missing=handle_missing,
agg_fcn=agg_fcn,
batch_size=batch_size,
num_workers=num_workers,
skip_failures=skip_failures,
progress=progress,
)
if model is not None:
embeddings, sample_ids, label_ids = fbu.get_embeddings(
samples,
model=_model,
patches_field=patches_field or roi_field,
embeddings=embeddings,
embeddings_field=embeddings_field,
force_square=force_square,
alpha=alpha,
handle_missing=handle_missing,
agg_fcn=agg_fcn,
batch_size=batch_size,
num_workers=num_workers,
skip_failures=skip_failures,
progress=progress,
)
else:
assert hash_method is not None
hashes = fbd._compute_filehashes(samples, hash_method, progress)
sample_ids, label_ids = fbu.get_ids(
samples,
patches_field=patches_field or roi_field,
data=hashes,
data_type="hash",
handle_missing=handle_missing,
ref_sample_ids=None,
)
embeddings = np.asarray(list(hashes.values())).astype(np.float64)

else:
embeddings = None

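One note on the metric switch in compute_similarity above: because the hashes are 0/1 vectors cast to float, the Manhattan (L1) distance handed to the sklearn backend is exactly the Hamming distance between hashes. A small standalone sanity check (not part of the PR):

    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances

    a = np.array([[0, 1, 1, 0, 1, 0, 0, 1]], dtype=np.float64)
    b = np.array([[0, 1, 0, 0, 1, 1, 0, 1]], dtype=np.float64)

    l1 = pairwise_distances(a, b, metric="manhattan")[0, 0]  # 2.0
    hamming = int(np.sum(a != b))  # 2 differing bits

    print(l1, hamming)  # L1 on binary vectors == Hamming distance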