diff --git a/Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java b/Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java
index 09555a9..f5732da 100644
--- a/Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java
+++ b/Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java
@@ -63,7 +63,7 @@ public static String bytesToHex(byte[] bytes){
 
 	/***
 	 * Attempts to convert data types, which may be values of metadata properties
-	 * or custom meta data, to a String. Data types supported:
+	 * or custom metadata, to a String. Data types supported:
 	 * - String
 	 * - Integer
 	 * - Long Integer
diff --git a/Java/src/main/java/com/nuix/superutilities/misc/ProfileDigester.java b/Java/src/main/java/com/nuix/superutilities/misc/ProfileDigester.java
new file mode 100644
index 0000000..a31bf71
--- /dev/null
+++ b/Java/src/main/java/com/nuix/superutilities/misc/ProfileDigester.java
@@ -0,0 +1,265 @@
+package com.nuix.superutilities.misc;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+import org.apache.log4j.Logger;
+
+import nuix.Case;
+import nuix.Item;
+import nuix.ItemExpression;
+import nuix.ItemSet;
+import nuix.MetadataItem;
+import nuix.MetadataProfile;
+import nuix.ItemEventCallback;
+import nuix.ItemEventInfo;
+
+/***
+ * Generates MD5 digests from the values a metadata profile yields for an item (optionally followed by the
+ * item's content text) and can add items to an item set which is deduplicated using those digests.
+ */
+public class ProfileDigester {
+	private static final Logger logger = Logger.getLogger(ProfileDigester.class);
+
+	// Digested text is always encoded as UTF-8 so that digests do not depend on the platform default charset
+	private static final Charset DIGEST_CHARSET = StandardCharsets.UTF_8;
+
+	private boolean includeItemText = false;
+	private MetadataProfile profile = null;
+
+	private BiConsumer<Integer,Integer> progressCallback = null;
+	private Consumer<String> infoMessageCallback = null;
+	private BiConsumer<String,Item> errorCallback = null;
+
+	private void fireProgressUpdate(int current, int total) {
+		if(progressCallback != null) {
+			progressCallback.accept(current, total);
+		}
+	}
+
+	private void logInfo(String message) {
+		if(infoMessageCallback != null) {
+			infoMessageCallback.accept(message);
+		} else {
+			logger.info(message);
+		}
+	}
+
+	private void logError(String message, Item item) {
+		if(errorCallback != null) {
+			errorCallback.accept(message, item);
+		} else {
+			logger.error(message);
+		}
+	}
+
+	/***
+	 * Invoked when progress is updated in {@link #addItemsToItemSet(Case, String, String, Collection)}. Provides 2 integers, the first is current progress and the second is total progress.
+	 * @param callback Callback to be invoked when progress is updated.
+	 */
+	public void whenProgressUpdated(BiConsumer<Integer,Integer> callback) {
+		progressCallback = callback;
+	}
+
+	/***
+	 * Invoked when a message is logged by {@link #addItemsToItemSet(Case, String, String, Collection)}.
+	 * @param callback Callback invoked when a message is logged. If callback is not provided, message is instead sent to Nuix log.
+	 */
+	public void whenMessageLogged(Consumer<String> callback) {
+		infoMessageCallback = callback;
+	}
+
+	/***
+	 * Invoked when an error occurs in {@link #addItemsToItemSet(Case, String, String, Collection)}. Provides a message String and the item being processed when
+	 * the error occurred. If callback is not provided, message will instead be sent to Nuix log.
+	 * @param callback Callback invoked with the error message and the item being processed when the error occurred.
+	 */
+	public void whenErrorLogged(BiConsumer<String,Item> callback) {
+		errorCallback = callback;
+	}
+
+	public ProfileDigester() {}
+	public ProfileDigester(MetadataProfile metadataProfile) { profile = metadataProfile; }
+
+	/***
+	 * Adds items to an item set using "Scripted" deduplication, providing a MD5 hash generated from the concatenation of the values yielded
+	 * by the MetadataProfile associated with this instance. Additionally can include an item's content text in the concatenated values.
+	 * Effectively this allows you to create an item set where rather than deduplicating by the MD5 Nuix calculated for a given item during
+	 * ingestion, the values of the given metadata profile are used to determine original/duplicate status of the provided items.
+	 * @param nuixCase The Nuix Case containing the items to be deduplicated and where the ItemSet will be created.
+	 * @param itemSetName Name of item set. If item set already exists, existing item set will be used, if not one will be created. Important: when
+	 * adding items to an existing item set, it is important that items previously added to that item set were added using this method, the same metadata profile
+	 * and same setting for includeItemText, otherwise deduplication results are undefined.
+	 * @param deduplicateBy Valid options are "INDIVIDUAL" and "FAMILY" (case insensitive), these settings correspond to the behaviors noted in addItems(Collection items, Map options).
+	 * @param items The items to add to the item set.
+	 * @return The item set the items were added to.
+	 * @throws IllegalArgumentException If no profile has been set, itemSetName is null or blank, or deduplicateBy is null or not an accepted value.
+	 */
+	public ItemSet addItemsToItemSet(Case nuixCase, String itemSetName, String deduplicateBy, Collection<Item> items) {
+		// Require we have a profile to work with
+		if(profile == null) {
+			throw new IllegalArgumentException("profile cannot be null, please provide a profile by calling setProfile(MetadataProfile profile) before calling this method");
+		}
+
+		// Require that an item set name was provided
+		if(itemSetName == null || itemSetName.trim().isEmpty()) {
+			throw new IllegalArgumentException("itemSetName cannot be null or empty");
+		}
+
+		// Require that a "deduplicateBy" value is provided which will be accepted by the API
+		if(deduplicateBy == null) {
+			throw new IllegalArgumentException("deduplicateBy cannot be null, must be 'FAMILY' or 'INDIVIDUAL'");
+		}
+		// Locale.ROOT keeps the upper casing stable regardless of the JVM default locale
+		final String dedupeMode = deduplicateBy.trim().toUpperCase(Locale.ROOT);
+		if(!dedupeMode.equals("FAMILY") && !dedupeMode.equals("INDIVIDUAL")) {
+			throw new IllegalArgumentException("deduplicateBy can only be 'FAMILY' or 'INDIVIDUAL', was provided: "+dedupeMode);
+		}
+
+		logInfo("Deduplicate By: "+dedupeMode);
+		String profileName = profile.getName();
+		if(profileName == null || profileName.trim().isEmpty()) {
+			profileName = "(unnamed profile)";
+		}
+		logInfo("Using metadata profile "+profileName);
+
+		// Is there an existing item set with this name?
+		ItemSet targetItemSet = nuixCase.findItemSetByName(itemSetName);
+
+		// If not, we shall create an item set with this name
+		if(targetItemSet == null) {
+			logInfo("Creating ItemSet with name "+itemSetName);
+			String description = null;
+			if(includeItemText) {
+				description = String.format("Generated using MD5 of profile '%s' field values concatenation and Item Text", profileName);
+			} else {
+				description = String.format("Generated using MD5 of profile '%s' field values concatenation", profileName);
+			}
+
+			Map<String,Object> itemSetSettings = new HashMap<>();
+			itemSetSettings.put("deduplication", "Scripted");
+			itemSetSettings.put("description", description);
+			itemSetSettings.put("deduplicateBy", dedupeMode);
+			targetItemSet = nuixCase.createItemSet(itemSetName, itemSetSettings);
+		} else {
+			logInfo("Using existing ItemSet with name "+itemSetName);
+		}
+
+		// Build settings Map for call to addItems which includes:
+		// - Our custom expression which internally generates the custom MD5 for each item using provided metadata profile
+		// - Progress callback which will in turn call fireProgressUpdate
+		Map<String,Object> settings = new HashMap<>();
+
+		// Define custom expression
+		settings.put("expression", new ItemExpression() {
+			@Override
+			public String evaluate(Item item) {
+				try {
+					return generateMd5String(item);
+				} catch (Exception e) {
+					// Include the exception in the message so the underlying cause is not lost
+					String message = String.format("Error while generating custom MD5 for item with GUID %s and name %s: %s", item.getGuid(), item.getLocalisedName(), e);
+					logError(message, item);
+					// NOTE(review): every failed item yields this same value, so presumably failed items deduplicate against each other
+					return "ERROR";
+				}
+			}
+		});
+
+		// Define progress callback which will in turn push out progress updates to callback on this instance
+		final int totalItems = items.size();
+		settings.put("progress", new ItemEventCallback() {
+			@Override
+			public void itemProcessed(ItemEventInfo info) {
+				fireProgressUpdate((int)info.getStageCount(), totalItems);
+			}
+		});
+
+		// Add the items to the item set
+		targetItemSet.addItems(items, settings);
+
+		// Provide back item set we used/created
+		return targetItemSet;
+	}
+
+	/***
+	 * Generates MD5 digest byte array for a given item. Digest is generated by digesting concatenation of values yielded by the
+	 * metadata profile associated with this instance for the given item and optionally including the item's content text.
+	 * Values are encoded as UTF-8 and digested back to back without any separator; fields which yield null are skipped.
+	 * @param item The item to generate a custom MD5 digest for.
+	 * @return Byte array representation of the MD5 digest
+	 * @throws Exception Most likely if metadata profile has not yet been set for this instance.
+	 */
+	public byte[] generateMd5Bytes(Item item) throws Exception {
+		if(profile == null) {
+			throw new IllegalArgumentException("profile cannot be null, please provide a profile by calling setProfile(MetadataProfile profile) before calling this method");
+		}
+
+		MessageDigest md = MessageDigest.getInstance("MD5");
+		for(MetadataItem field : profile.getMetadata()) {
+			String fieldValue = field.evaluate(item);
+			if(fieldValue != null) {
+				md.update(fieldValue.getBytes(DIGEST_CHARSET));
+			}
+		}
+
+		if(includeItemText) {
+			md.update(item.getTextObject().toString().getBytes(DIGEST_CHARSET));
+		}
+
+		return md.digest();
+	}
+
+	/***
+	 * Generates MD5 digest hex string for a given item. Digest is generated by digesting the concatenation of values yielded by the
+	 * metadata profile associated with this instance for the given item and optionally including the item's content text.
+	 * Internally this method first calls {@link #generateMd5Bytes(Item)} then converts the result of that method
+	 * into a hexadecimal string.
+	 * @param item The item to generate a custom MD5 digest for.
+	 * @return Hexadecimal string representation of the MD5 digest
+	 * @throws Exception Most likely if metadata profile has not yet been set for this instance.
+	 */
+	public String generateMd5String(Item item) throws Exception {
+		// generateMd5Bytes performs the check that a profile has been set
+		return FormatUtility.bytesToHex(generateMd5Bytes(item));
+	}
+
+	/***
+	 * Gets whether this instance should include the item's content text when calculating a digest.
+	 * @return Whether this instance should include the item's content text when calculating a digest.
+	 */
+	public boolean getIncludeItemText() {
+		return includeItemText;
+	}
+
+	/***
+	 * Sets whether this instance should include the item's content text when calculating a digest.
+	 * @param includeItemText Whether this instance should include the item's content text when calculating a digest.
+	 */
+	public void setIncludeItemText(boolean includeItemText) {
+		this.includeItemText = includeItemText;
+	}
+
+	/***
+	 * Gets the metadata profile used to obtain the values provided to the hash computation.
+	 * @return The metadata profile used to obtain the values provided to the hash computation.
+	 */
+	public MetadataProfile getProfile() {
+		return profile;
+	}
+
+	/***
+	 * Sets the metadata profile used to obtain the values provided to the hash computation.
+	 * @param profile The metadata profile used to obtain the values provided to the hash computation.
+	 */
+	public void setProfile(MetadataProfile profile) {
+		this.profile = profile;
+	}
+}
diff --git a/RubyTests/Test_ProfileDigester.rb b/RubyTests/Test_ProfileDigester.rb
new file mode 100644
index 0000000..a5856b6
--- /dev/null
+++ b/RubyTests/Test_ProfileDigester.rb
@@ -0,0 +1,97 @@
+script_directory = File.dirname(__FILE__)
+require File.join(script_directory,"SuperUtilities.jar")
+java_import com.nuix.superutilities.SuperUtilities
+$su = SuperUtilities.init($utilities,NUIX_VERSION)
+java_import com.nuix.superutilities.misc.ProfileDigester
+
+$current_case = $utilities.getCaseFactory.open("D:\\cases\\FakeData_8.0")
+
+profile_digester = ProfileDigester.new
+
+include_content_text = true
+items = $current_case.search("kind:email")
+
+profile = $utilities.getMetadataProfileStore.createMetadataProfile
+profile = profile.addMetadata("SPECIAL","Name")
+profile = profile.addMetadata("SPECIAL","Kind")
+
+profile_digester.setProfile(profile)
+profile_digester.setIncludeItemText(include_content_text)
+
+profile_digester.whenMessageLogged do |message|
+	puts message
+end
+
+profile_digester.whenProgressUpdated do |current,total|
+	if current == 0 || current == total || current % 100 == 0
+		puts "Progress #{current}/#{total}"
+	end
+end
+
+profile_digester.whenErrorLogged do |message,item|
+	puts message
+end
+
+concat_grouped = Hash.new{|h,k| h[k] = [] }
+hash_grouped = Hash.new{|h,k| h[k] = [] }
+
+items.each do |item|
+	concat = profile.getMetadata.map{|field| field.evaluate(item)}.join
+	concat << item.getTextObject.toString if include_content_text
+
+	hash = profile_digester.generateMd5String(item)
+
+	concat_grouped[concat] << item
+	hash_grouped[hash] << item
+end
+
+if concat_grouped.size == hash_grouped.size
+	puts "Group sizes match"
+
+
+	# Ruby iterates a hash in key insertion order so I believe each
+	# entry should correspond to the same "dedupe group". We will peel
+	# off 1 group from each and make sure the concatenation based key
+	# grouped up the same number of input items as the hash based key.
+	cge = concat_grouped.to_enum
+	hge = hash_grouped.to_enum
+
+	matched = 0
+	mismatched = 0
+
+	concat_grouped.size.times do
+		# Get next key value pair from each hash
+		c = cge.next
+		h = hge.next
+
+		# For each group, do they have the same number of items? We expect they should.
+		if c[1].size != h[1].size
+			# If item counts for same group differ, report about it
+			puts "="*20
+			puts "#{c[0]} => #{c[1].size}"
+			puts "#{h[0]} => #{h[1].size}"
+			puts "Concat had #{c[1].size} for group but hash had #{h[1].size} for same group"
+			mismatched += 1
+		else
+			matched += 1
+		end
+	end
+
+	puts "Matched: #{matched}"
+	puts "Mismatched: #{mismatched}"
+
+	# Now we put same items into an item set using ProfileDigester, we expect that the
+	# total number of originals in that item set should match the number of groups in either hash,
+	# 1 original per group, when deduping by individual
+	item_set_name = "Profile Digester #{Time.now.to_i}"
+	dedupe_by = "INDIVIDUAL"
+	item_set = profile_digester.addItemsToItemSet($current_case,item_set_name,dedupe_by,items)
+	puts "Item Set Originals: #{item_set.getOriginals.size}, Expected: #{concat_grouped.size}"
+	# We expect that duplicates will be items count - originals count
+	puts "Item Set Duplicates: #{item_set.getDuplicates.size}, Expected: #{items.size - concat_grouped.size}"
+else
+	puts "Sizes don't match!"
+end
+
+
+$current_case.close
\ No newline at end of file