Commit bb0909c

dumb image things

stijn-uva committed Oct 24, 2024
1 parent ac543cc · commit bb0909c
Showing 2 changed files with 72 additions and 47 deletions.
40 changes: 3 additions & 37 deletions processors/filtering/unique_images.py
@@ -1,15 +1,12 @@
"""
Filter by unique images
"""
import imagehash
import hashlib
import shutil
import json

from PIL import Image
from backend.lib.processor import BasicProcessor
from common.lib.exceptions import ProcessorInterruptedException
from common.lib.helpers import UserInput
from common.lib.helpers import UserInput, hash_file

__author__ = "Stijn Peeters"
__credits__ = ["Stijn Peeters"]
@@ -60,37 +57,6 @@ def is_compatible_with(cls, module=None, user=None):
return module.get_media_type() == "image" or module.type.startswith(
"image-downloader") or module.type == "video-frames"

def hash_file(self, image_file, hash_type="file-hash"):
"""
Generate an image hash
:param Path image_file: Image file to hash
:param str hash_type: Hash type, one of `file-hash`, `colorhash`,
`phash`, `average_hash`, `dhash`
:return str: Hexadecimal hash value
"""
if not image_file.exists():
raise FileNotFoundError()

if hash_type == "file-hash":
hasher = hashlib.sha1()

# Open the file in binary mode
with image_file.open("rb") as infile:
# Read and update hash in chunks to handle large files
while chunk := infile.read(1024):
hasher.update(chunk)

return hasher.hexdigest()

elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
image = Image.open(image_file)

return str(getattr(imagehash, hash_type)(image))

else:
raise NotImplementedError(f"Unknown hash type '{hash_type}'")

def process(self):
"""
Loop through images and only retain ones that have not been seen yet
@@ -111,7 +77,7 @@ def process(self):

self.dataset.update_progress(processed / self.source_dataset.num_rows)
if processed % 100 == 0:
self.dataset.update_progress(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, "
self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, "
f"found {dupes:,} duplicate(s)")
processed += 1

@@ -120,7 +86,7 @@ def process(self):
metadata = json.load(infile)
continue

image_hash = self.hash_file(image_file, self.parameters.get("hash-type"))
image_hash = hash_file(image_file, self.parameters.get("hash-type"))

if image_hash not in seen_hashes:
seen_hashes.add(image_hash)
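The method removed above now lives in common.lib.helpers (note the changed import at the top of the file). For reference, a module-level version would plausibly look like the sketch below; it is reconstructed from the removed method, and the helper actually added in the commit may differ in details.

import hashlib

import imagehash
from PIL import Image


def hash_file(image_file, hash_type="file-hash"):
    """
    Generate an image hash

    :param Path image_file:  Image file to hash
    :param str hash_type:  Hash type, one of `file-hash`, `colorhash`,
    `phash`, `average_hash`, `dhash`
    :return str:  Hexadecimal hash value
    """
    if not image_file.exists():
        raise FileNotFoundError()

    if hash_type == "file-hash":
        hasher = hashlib.sha1()

        # read and update the hash in chunks to handle large files
        with image_file.open("rb") as infile:
            while chunk := infile.read(1024):
                hasher.update(chunk)

        return hasher.hexdigest()

    elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
        # perceptual hashes are delegated to the imagehash package
        image = Image.open(image_file)
        return str(getattr(imagehash, hash_type)(image))

    else:
        raise NotImplementedError(f"Unknown hash type '{hash_type}'")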
79 changes: 69 additions & 10 deletions processors/networks/image-network.py
@@ -4,6 +4,7 @@
import json

from backend.lib.processor import BasicProcessor
from common.lib.helpers import hash_file

import networkx as nx

@@ -61,6 +62,20 @@ def get_options(cls, parent_dataset=None, user=None):
"tooltip": "The image node label will have this value. Depending on the network visualisation software "
"you use, one or the other is required to display the images as nodes."
},
"deduplicate": {
"type": UserInput.OPTION_CHOICE,
"help": "Merge images",
"tooltip": "Similar images can be merged into a single node, represented by the first image of the set "
"that was encountered.",
"options": {
"none": "Do not merge",
"file-hash": "File hash (files need to be byte-by-byte duplicates)",
"colorhash": "Colour hash (good at colours, worse at shapes)",
"phash": "Perceptual hash (decent at colours and shapes)",
"average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)",
"dhash": "Difference hash (similar to average hash, better at photos and art)"
}
},
**({
"column": {
"help": "Dataset field",
@@ -83,22 +98,48 @@ def is_compatible_with(cls, module=None, user=None):

def process(self):
column = self.parameters.get("column")
hash_type = self.parameters.get("deduplicate")
filename_filter = [".metadata.json"] if hash_type == "none" else []
metadata = None
for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]):
with file.open() as infile:
hashed = 0

# some maps to make sure we use the right value in the right place
# url or filename, original image or duplicate, etc
file_hash_map = {}
hash_file_map = {}
seen_hashes = set()
id_file_map = {}

for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter):
if file.name == ".metadata.json":
with file.open() as infile:
try:
metadata = json.load(infile)
file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()}
except json.JSONDecodeError:
pass
else:
try:
metadata = json.load(infile)
except json.JSONDecodeError:
pass
hashed += 1
if hashed % 100 == 0:
self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)")
self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5)
file_hash = hash_file(file, hash_type)
file_hash_map[file.name] = file_hash
if file_hash not in hash_file_map:
hash_file_map[file_hash] = file.name

except (FileNotFoundError, ValueError) as e:
continue

if not metadata:
return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only "
"be run on sets of images sourced from another 4CAT dataset.")

id_file_map = {}
file_url_map = {v["filename"]: u for u, v in metadata.items()}
for url, details in metadata.items():
for item_id in details.get("post_ids", []):
id_file_map[item_id] = url if self.parameters.get("image-value") == "url" else details["filename"]
id_file_map[item_id] = details["filename"]

root_dataset = None
for parent in reversed(self.dataset.get_genealogy()):
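The hashing loop above builds two complementary maps: file_hash_map resolves any archived file to its hash, while hash_file_map remembers the first file seen for each hash, so later duplicates collapse onto that first file. A standalone sketch of the pattern with made-up filenames and hash values:

files = ["a.jpg", "b.jpg", "c.jpg"]
hashes = {"a.jpg": "ff01", "b.jpg": "ff01", "c.jpg": "ab23"}  # stand-in for hash_file() output

file_hash_map = {}   # filename -> hash
hash_file_map = {}   # hash -> first filename seen with that hash
for filename in files:
    file_hash = hashes[filename]
    file_hash_map[filename] = file_hash
    if file_hash not in hash_file_map:
        hash_file_map[file_hash] = filename

# b.jpg resolves to a.jpg, the first image encountered with the same hash
print(hash_file_map[file_hash_map["b.jpg"]])  # "a.jpg"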
@@ -113,7 +154,12 @@ def process(self):
network = nx.DiGraph()
processed = 0
for item in root_dataset.iterate_items():
self.dataset.update_progress(processed / root_dataset.num_rows)
progress = processed / root_dataset.num_rows
if hashed:
# if hashing was necessary, we approximate that as 50% of the work
progress = (progress * 0.5) + 0.5

self.dataset.update_progress(progress)
processed += 1
if processed % 100 == 0:
self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)")
@@ -126,9 +172,22 @@

from_node_label = item.get(column)
from_node = f"{column}-{from_node_label}"
to_node_label = id_file_map[item.get("id")]
to_node = f"image-{to_node_label}"

image_file = id_file_map[item.get("id")]
image_hash = file_hash_map[image_file]
if image_hash in seen_hashes:
to_node_label = hash_file_map[image_hash]
if image_file != to_node_label:
self.dataset.update_status(f"Image {image_file} is a duplicate of {to_node_label} - merging.")

else:
seen_hashes.add(image_hash)
to_node_label = id_file_map[item.get("id")]

if self.parameters.get("image-value") == "url":
to_node_label = file_url_map[to_node_label]

to_node = f"image-{to_node_label}"
if from_node not in network.nodes:
network.add_node(from_node, label=from_node_label, category=column)

