Commit bb0909c

dumb image things

stijn-uva committed Oct 24, 2024
1 parent ac543cc · commit bb0909c
Showing 2 changed files with 72 additions and 47 deletions.
40 changes: 3 additions & 37 deletions processors/filtering/unique_images.py
@@ -1,15 +1,12 @@
"""
Filter by unique images
"""
import imagehash
import hashlib
import shutil
import json

from PIL import Image
from backend.lib.processor import BasicProcessor
from common.lib.exceptions import ProcessorInterruptedException
from common.lib.helpers import UserInput
from common.lib.helpers import UserInput, hash_file

__author__ = "Stijn Peeters"
__credits__ = ["Stijn Peeters"]
@@ -60,37 +57,6 @@ def is_compatible_with(cls, module=None, user=None):
return module.get_media_type() == "image" or module.type.startswith(
"image-downloader") or module.type == "video-frames"

def hash_file(self, image_file, hash_type="file-hash"):
"""
Generate an image hash
:param Path image_file: Image file to hash
:param str hash_type: Hash type, one of `file-hash`, `colorhash`,
`phash`, `average_hash`, `dhash`
:return str: Hexadecimal hash value
"""
if not image_file.exists():
raise FileNotFoundError()

if hash_type == "file-hash":
hasher = hashlib.sha1()

# Open the file in binary mode
with image_file.open("rb") as infile:
# Read and update hash in chunks to handle large files
while chunk := infile.read(1024):
hasher.update(chunk)

return hasher.hexdigest()

elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
image = Image.open(image_file)

return str(getattr(imagehash, hash_type)(image))

else:
raise NotImplementedError(f"Unknown hash type '{hash_type}'")

def process(self):
"""
Loop through images and only retain ones that have not been seen yet
@@ -111,7 +77,7 @@ def process(self):

self.dataset.update_progress(processed / self.source_dataset.num_rows)
if processed % 100 == 0:
self.dataset.update_progress(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, "
self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, "
f"found {dupes:,} duplicate(s)")
processed += 1

@@ -120,7 +86,7 @@ def process(self):
metadata = json.load(infile)
continue

image_hash = self.hash_file(image_file, self.parameters.get("hash-type"))
image_hash = hash_file(image_file, self.parameters.get("hash-type"))

if image_hash not in seen_hashes:
seen_hashes.add(image_hash)
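The method removed above now lives in common.lib.helpers (note the changed import at the top of the file). For reference, a module-level version would plausibly look like the sketch below; it is reconstructed from the removed method, and the helper actually added in the commit may differ in details.

import hashlib

import imagehash
from PIL import Image


def hash_file(image_file, hash_type="file-hash"):
    """
    Generate an image hash

    :param Path image_file:  Image file to hash
    :param str hash_type:  Hash type, one of `file-hash`, `colorhash`,
    `phash`, `average_hash`, `dhash`
    :return str:  Hexadecimal hash value
    """
    if not image_file.exists():
        raise FileNotFoundError()

    if hash_type == "file-hash":
        hasher = hashlib.sha1()

        # read and update the hash in chunks to handle large files
        with image_file.open("rb") as infile:
            while chunk := infile.read(1024):
                hasher.update(chunk)

        return hasher.hexdigest()

    elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
        # perceptual hashes are delegated to the imagehash package
        image = Image.open(image_file)
        return str(getattr(imagehash, hash_type)(image))

    else:
        raise NotImplementedError(f"Unknown hash type '{hash_type}'")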
79 changes: 69 additions & 10 deletions processors/networks/image-network.py
@@ -4,6 +4,7 @@
import json

from backend.lib.processor import BasicProcessor
from common.lib.helpers import hash_file

import networkx as nx

@@ -61,6 +62,20 @@ def get_options(cls, parent_dataset=None, user=None):
"tooltip": "The image node label will have this value. Depending on the network visualisation software "
"you use, one or the other is required to display the images as nodes."
},
"deduplicate": {
"type": UserInput.OPTION_CHOICE,
"help": "Merge images",
"tooltip": "Similar images can be merged into a single node, represented by the first image of the set "
"that was encountered.",
"options": {
"none": "Do not merge",
"file-hash": "File hash (files need to be byte-by-byte duplicates)",
"colorhash": "Colour hash (good at colours, worse at shapes)",
"phash": "Perceptual hash (decent at colours and shapes)",
"average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)",
"dhash": "Difference hash (similar to average hash, better at photos and art)"
}
},
**({
"column": {
"help": "Dataset field",
@@ -83,22 +98,48 @@ def is_compatible_with(cls, module=None, user=None):

def process(self):
column = self.parameters.get("column")
hash_type = self.parameters.get("deduplicate")
filename_filter = [".metadata.json"] if hash_type == "none" else []
metadata = None
for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]):
with file.open() as infile:
hashed = 0

# some maps to make sure we use the right value in the right place
# url or filename, original image or duplicate, etc
file_hash_map = {}
hash_file_map = {}
seen_hashes = set()
id_file_map = {}

for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter):
if file.name == ".metadata.json":
with file.open() as infile:
try:
metadata = json.load(infile)
file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()}
except json.JSONDecodeError:
pass
else:
try:
metadata = json.load(infile)
except json.JSONDecodeError:
pass
hashed += 1
if hashed % 100 == 0:
self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)")
self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5)
file_hash = hash_file(file, hash_type)
file_hash_map[file.name] = file_hash
if file_hash not in hash_file_map:
hash_file_map[file_hash] = file.name

except (FileNotFoundError, ValueError) as e:
continue

if not metadata:
return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only "
"be run on sets of images sourced from another 4CAT dataset.")

id_file_map = {}
file_url_map = {v["filename"]: u for u, v in metadata.items()}
for url, details in metadata.items():
for item_id in details.get("post_ids", []):
id_file_map[item_id] = url if self.parameters.get("image-value") == "url" else details["filename"]
id_file_map[item_id] = details["filename"]

root_dataset = None
for parent in reversed(self.dataset.get_genealogy()):
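The hashing loop above builds two complementary maps: file_hash_map resolves any archived file to its hash, while hash_file_map remembers the first file seen for each hash, so later duplicates collapse onto that first file. A standalone sketch of the pattern with made-up filenames and hash values:

files = ["a.jpg", "b.jpg", "c.jpg"]
hashes = {"a.jpg": "ff01", "b.jpg": "ff01", "c.jpg": "ab23"}  # stand-in for hash_file() output

file_hash_map = {}   # filename -> hash
hash_file_map = {}   # hash -> first filename seen with that hash
for filename in files:
    file_hash = hashes[filename]
    file_hash_map[filename] = file_hash
    if file_hash not in hash_file_map:
        hash_file_map[file_hash] = filename

# b.jpg resolves to a.jpg, the first image encountered with the same hash
print(hash_file_map[file_hash_map["b.jpg"]])  # "a.jpg"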
@@ -113,7 +154,12 @@ def process(self):
network = nx.DiGraph()
processed = 0
for item in root_dataset.iterate_items():
self.dataset.update_progress(processed / root_dataset.num_rows)
progress = processed / root_dataset.num_rows
if hashed:
# if hashing was necessary, we approximate that as 50% of the work
progress = (progress * 0.5) + 0.5

self.dataset.update_progress(progress)
processed += 1
if processed % 100 == 0:
self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)")
@@ -126,9 +172,22 @@

from_node_label = item.get(column)
from_node = f"{column}-{from_node_label}"
to_node_label = id_file_map[item.get("id")]
to_node = f"image-{to_node_label}"

image_file = id_file_map[item.get("id")]
image_hash = file_hash_map[image_file]
if image_hash in seen_hashes:
to_node_label = hash_file_map[image_hash]
if image_file != to_node_label:
self.dataset.update_status(f"Image {image_file} is a duplicate of {to_node_label} - merging.")

else:
seen_hashes.add(image_hash)
to_node_label = id_file_map[item.get("id")]

if self.parameters.get("image-value") == "url":
to_node_label = file_url_map[to_node_label]

to_node = f"image-{to_node_label}"
if from_node not in network.nodes:
network.add_node(from_node, label=from_node_label, category=column)

