Skip to content

Commit

Permalink
Continued development of Tag support in AnnotationRepository
Browse files Browse the repository at this point in the history
  • Loading branch information
JuicyDragon committed Jun 20, 2019
1 parent f4656e1 commit f16caa8
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ public void whenMessageLogged(Consumer<String> callback) {
messageLoggedCallback = callback;
}

/***
* Logs a message, either providing it to the callback supplied in a call to {@link #whenMessageLogged(Consumer)} or in
* absence of that callback, to log4j.
* @param message
*/
private void logMessage(String message) {
if(messageLoggedCallback != null) {
messageLoggedCallback.accept(message);
Expand All @@ -60,6 +65,13 @@ private void logMessage(String message) {
}
}

/***
* Logs a message, either providing it to the callback supplied in a call to {@link #whenMessageLogged(Consumer)} or in
* absence of that callback, to log4j. This method passes the message through a call to String.format with the message
* being provided as the format and the params provided as the params to String.format.
* @param format The message format string, formatted as accepted by String.format.
* @param params Parameters to be inserted into the formatted string, as accepted by String.format.
*/
private void logMessage(String format, Object... params) {
logMessage(String.format(format, params));
}
Expand All @@ -74,6 +86,11 @@ public void whenProgressUpdated(BiConsumer<Integer,Integer> callback) {
progressUpdatedCallback = callback;
}

/***
* Invokes callback previously provided in a call to {@link #whenProgressUpdated(BiConsumer)}, if one has been provided.
* @param current The current progress amount.
* @param total The total amount of work.
*/
private void fireProgressUpdated(int current, int total) {
if(progressUpdatedCallback != null) {
progressUpdatedCallback.accept(current,total);
Expand Down Expand Up @@ -138,7 +155,8 @@ private void createTables() throws SQLException {
}

/***
* Rebuilds in memory look ups for GUID/MD5 => database record IDs
* Rebuilds in memory look ups for GUID/MD5 => database record IDs. Later as new records are added
* to the database file, these lookup will be updated in tandem as in memory caches.
* @throws SQLException Thrown if there are errors while interacting with the SQLite DB file.
*/
private void rebuildXrefs() throws SQLException {
Expand Down Expand Up @@ -179,7 +197,7 @@ public void accept(ResultSet rs) {
});

logMessage("Building Tag name lookup from any existing entries in DB...");
markupSetIdLookup.clear();
tagIdLookup.clear();
sql = "SELECT Name,ID FROM Tag";
executeQuery(sql,null, new Consumer<ResultSet>() {
@Override
Expand All @@ -188,7 +206,7 @@ public void accept(ResultSet rs) {
while(rs.next()) {
String name = rs.getString(1);
long id = rs.getLong(2);
markupSetIdLookup.put(name, id);
tagIdLookup.put(name, id);
}
} catch (SQLException exc) {
logger.error("Error building Tag Name to ID XREF",exc);
Expand Down Expand Up @@ -229,7 +247,12 @@ public long getItemId(Item item) throws SQLException {
String name = item.getLocalisedName();

byte[] guidBytes = FormatUtility.hexToBytes(guid);
byte[] md5Bytes = FormatUtility.hexToBytes(md5);
byte[] md5Bytes = null;
if(md5 == null) {
logMessage("Item with GUID %s has no MD5",guid);
} else {
md5Bytes = FormatUtility.hexToBytes(md5);
}

String sql = "INSERT INTO Item (GUID,MD5,Name) VALUES (?,?,?)";
executeInsert(sql, guidBytes, md5Bytes, name);
Expand Down Expand Up @@ -279,9 +302,16 @@ public long getTagId(String tagName) throws SQLException {
}
}

/***
* Stores a specific tag present in the provided case as records in the DB file.
* @param nuixCase The Nuix case that the specified tag is present in.
* @param tagName The name of the tag in the specified case to store in the DB file.
* @throws IOException Thrown if a search error occurs.
* @throws SQLException Thrown if there are errors while interacting with the SQLite DB file.
*/
public void storeTag(Case nuixCase, String tagName) throws IOException, SQLException {
logMessage("Storing tag: %s",tagName);
String insertItemTag = "INSERT INTO Tag (Item_ID,Tag_ID) VALUES (?,?)";
String insertItemTag = "INSERT INTO ItemTag (Item_ID,Tag_ID) VALUES (?,?)";
String itemQuery = QueryHelper.orTagQuery(tagName);
Set<Item> tagItems = nuixCase.searchUnsorted(itemQuery);
long tagId = getTagId(tagName);
Expand All @@ -295,6 +325,12 @@ public void storeTag(Case nuixCase, String tagName) throws IOException, SQLExcep
}
}

/***
* Stores all tags present in the provided case as records in the DB file.
* @param nuixCase The Nuix case to record tags from.
* @throws IOException Thrown if a search error occurs.
* @throws SQLException Thrown if there are errors while interacting with the SQLite DB file.
*/
public void storeAllTags(Case nuixCase) throws IOException, SQLException {
Set<String> tags = nuixCase.getAllTags();
for(String tag : tags) {
Expand All @@ -304,95 +340,129 @@ public void storeAllTags(Case nuixCase) throws IOException, SQLException {
}
}

/***
* Applies tags to items in the provided case based on tag records in the DB file associated to this instance.
* @param nuixCase The case in which items will be tagged.
* @param matchingMethod Determines how a record in the DB file is associated to an item in the case to apply tags to it.
* @throws SQLException Thrown if there are errors while interacting with the SQLite DB file.
*/
public void applyTagsFromDatabaseToCase(Case nuixCase, AnnotationMatchingMethod matchingMethod) throws SQLException {
// Will reuse this multiple times to provide values to be bound to prepared SQL statements later
List<Object> bindData = new ArrayList<Object>();
BulkAnnotater annotater = SuperUtilities.getInstance().getBulkAnnotater();
String itemTagSql = "SELECT i.GUID,i.MD5 FROM Tag AS t " +
// Will use this to apply tags later
BulkAnnotater annotater = SuperUtilities.getBulkAnnotater();

// SQL query to get information about each item a given tag is to be applied to
String itemTagSql = "SELECT i.GUID,i.MD5,i.Name FROM ItemTag AS t " +
"INNER JOIN Item AS i ON t.Item_ID = i.ID " +
"WHERE t.ID = ? " +
"WHERE t.Tag_ID = ? " +
"ORDER BY MD5,GUID";

String itemTagCountSql = "SELECT COUNT(*) FROM Tag AS t " +
// SQL query to get count of items a given tag should be applied to
String itemTagCountSql = "SELECT COUNT(*) FROM ItemTag AS t " +
"INNER JOIN Item AS i ON t.Item_ID = i.ID " +
"WHERE t.ID = ? " +
"WHERE t.Tag_ID = ? " +
"ORDER BY MD5,GUID";

// Always good to tell the user what you're doing and create a record of the settings in use
logMessage("Applying tags to case...");
if(matchingMethod == AnnotationMatchingMethod.GUID) {
logMessage("Matching DB entries to case items using: GUID");
} else if(matchingMethod == AnnotationMatchingMethod.MD5) {
logMessage("Matching DB entries to case items using: MD5");
}

// We use a cache for item retrieval, running a serach for the item by GUID or MD5 if requested but
// not currently present in the cache.
LoadingCache<String,Set<Item>> itemCache = CacheBuilder.newBuilder()
.maximumSize(1000)
.build(new CacheLoader<String,Set<Item>>(){
@Override
public Set<Item> load(String guidOrMd5) throws Exception {
// When a given GUID or MD5 is found to note already be present in our cache
// we will need to go find it in our case, cache it and return it.
Set<Item> items = new HashSet<Item>();
if(matchingMethod == AnnotationMatchingMethod.GUID) {
items = nuixCase.searchUnsorted("guid:"+guidOrMd5);
if(items.size() < 1) {
logMessage("No items in case found to match GUID: %s",guidOrMd5);
}
} else if(matchingMethod == AnnotationMatchingMethod.MD5) {
items = nuixCase.searchUnsorted("md5:"+guidOrMd5);
if(items.size() < 1) {
logMessage("No items in case found to match MD5: %s",guidOrMd5);
}
}
return items;
}
});
// Since we can apply any given tag to multiple items at once and this is a more efficient approach,
// we gather up all the items a given tag will be applied to and then apply that tag in batches. Periodically
// we will apply a batch of tags and clear this collection so we don't need to hold on to all the items receiving
// a given tag at once.
Set<Item> tagGroupedItems = new HashSet<Item>();

// Use our in memory cache of Name->ID to drive application of each tag since it should
// already be in memory and synced to the state of the database.
for(Map.Entry<String, Long> tagEntry : tagIdLookup.entrySet()) {
// Support aborting
if(abortWasRequested) { break; }

String tagName = tagEntry.getKey();
long tagId = tagEntry.getValue();

bindData.clear();
bindData.add(tagId);

// Determine how many items this tag should be applied to based on the number
// of ItemTag records associated to this tag.
int totalItemTags = executeLongScalar(itemTagCountSql,bindData).intValue();

// Here we run the query for ItemTag records associated with the tag we are currently
// processing. We will then collect up the relevant items. When we get a good batch of
// items, we tag them, clear the collection and continue on.
logMessage("Applying tag '%s' to %s items",tagName,totalItemTags);
executeQuery(itemTagSql,bindData, rs ->{
int currentIndex = 1;
try {
while(rs.next()) {
fireProgressUpdated(currentIndex,totalItemTags);
String guid = rs.getString(1);
String md5 = rs.getString(2);

// GUID and MD5 are hex strings. We store them in the database as the byte arrays those hex
// strings represent which reduces the storage footprint for these values. Nuix needs them as
// hex strings for use in queries, so we need to convert them back to hex strings here.
String guid = FormatUtility.bytesToHex(rs.getBytes(1));
String md5 = FormatUtility.bytesToHex(rs.getBytes(2));
String itemName = rs.getString(3);

// If a given record does not have an MD5 (likely because the source item had no Md5) we can't really
// use MD5 matching from database record to destination case item, so we report the issue to the user
// and skip this record.
if(md5 == null && matchingMethod == AnnotationMatchingMethod.MD5) {
logMessage("Record for item named '%s' with GUID %s does not have an MD5 value",itemName,guid);
continue;
}

Set<Item> items = null;

// Leverage our cache to minimize unnecessary searching for the same item or items repeatedly
// Obtain the relevant item or items depending on the matching method specified
if(matchingMethod == AnnotationMatchingMethod.GUID) {
items = itemCache.get(guid);
items = nuixCase.searchUnsorted("guid:"+guid);
} else if(matchingMethod == AnnotationMatchingMethod.MD5) {
items = itemCache.get(md5);
items = nuixCase.searchUnsorted("md5:"+md5);
}

// Add all the items we found to our collection
tagGroupedItems.addAll(items);

// If our collection has 5000 items or more in it now, lets tag those items and then
// clear the collection so we aren't holding on to all of the items at once.
if(tagGroupedItems.size() > 5000) {
logMessage(" Apply tag '%s' to 5000 items",tagName);
annotater.addTag(tagName, tagGroupedItems);
tagGroupedItems.clear();
}

// Apply tag to relevant items in the destination case
annotater.addTag(tagName, items);
currentIndex++;
}

// If there are any items left in our collection that still need a tag applied, we check and
// tag them here.
if(tagGroupedItems.size() > 0) {
logMessage(" Apply tag '%s' to %s items",tagName,tagGroupedItems.size());
annotater.addTag(tagName, tagGroupedItems);
tagGroupedItems.clear();
}

} catch (SQLException e) {
logger.error("Error retrieving ItemTag data from database", e);
logMessage("Error retrieving ItemTag data from database: %s",e.getMessage());
} catch (IOException e) {
logger.error("Error retrieving item from case", e);
logMessage("Error retrieving item from case: ",e.getMessage());
} catch (ExecutionException e) {
logger.error(e);
}
});
}
}

/***
* Stores a particular markup set living in the provided Nuix case.
* Stores a particular markup set present in the provided Nuix case.
* @param nuixCase The Nuix case containing the provided markup set.
* @param markupSet The specific markup set to store.
* @throws IOException Thrown most likely if there was an issue searching or retrieving printed pages of and item.
Expand Down Expand Up @@ -454,26 +524,33 @@ public void applyMarkupsFromDatabaseToCase(Case nuixCase, boolean addToExistingM

List<Object> bindData = new ArrayList<Object>();

// Use our in memory cache of Name->ID to drive application of each markup set since it should
// already be in memory and synced to the state of the database.
for(Map.Entry<String, Long> markupEntry : markupSetIdLookup.entrySet()) {
// Support aborting
if(abortWasRequested) { break; }

String markupSetName = markupEntry.getKey();
long markupSetId = markupEntry.getValue();

// SQL to get description for this markup set
String markupSetDescription = executeStringScalar("SELECT Description FROM MarkupSet WHERE ID = ?",markupSetId);
// SQL to get reason for this markup set
String markupSetRedactionReason = executeStringScalar("SELECT RedactionReason FROM MarkupSet WHERE ID = ?",markupSetId);

// Always good to echo back to user the settings they are using
logMessage("Applying markups to case from MarkupSet: %s",markupSetName);
if(matchingMethod == AnnotationMatchingMethod.GUID) {
logMessage("Matching DB entries to case items using: GUID");
} else if(matchingMethod == AnnotationMatchingMethod.MD5) {
logMessage("Matching DB entries to case items using: MD5");
}

// We need to resolve the MarkupSet object, either by obtaining existing one in case or creating new one
// We need to resolve the MarkupSet object, either by obtaining an existing one in destination case or creating a new one.
MarkupSet markupSet = null;
if(existingMarkupSetLookup.containsKey(markupSetName)) {
if(addToExistingMarkupSet) {
// We can just add more annotations the the existing markup set with the same name
logMessage("Applying markups in destination case to existing markup set: %s",markupSetName);
markupSet = existingMarkupSetLookup.get(markupSetName);
} else {
Expand Down Expand Up @@ -506,7 +583,7 @@ public void applyMarkupsFromDatabaseToCase(Case nuixCase, boolean addToExistingM

// SQL for info needed to apply markups. Sorted by MD5/GUID/PageNumber so that we should get markups for the same item
// one after another, making our cache defined below more efficiently leveraged.
String itemMarkupSql = "SELECT i.GUID,i.MD5,im.PageNumber,im.IsRedaction,im.X,im.Y,im.Width,im.Height FROM ItemMarkup AS im " +
String itemMarkupSql = "SELECT i.GUID,i.MD5,i.Name,im.PageNumber,im.IsRedaction,im.X,im.Y,im.Width,im.Height FROM ItemMarkup AS im " +
"INNER JOIN Item AS i ON im.Item_ID = i.ID " +
"WHERE im.MarkupSet_ID = ? " +
"ORDER BY MD5,GUID,PageNumber";
Expand Down Expand Up @@ -559,14 +636,27 @@ public void accept(ResultSet rs) {
if(abortWasRequested) { break; }

fireProgressUpdated(currentIndex,totalItemMarkups);

// GUID and MD5 are stored in database as byte arrays to save space, but we need them as hex strings
// for Nuix searching, so we need to convert them back to strings here.
String guid = FormatUtility.bytesToHex(rs.getBytes(1));
String md5 = FormatUtility.bytesToHex(rs.getBytes(2));
long pageNumber = rs.getLong(3);
boolean isRedaction = rs.getBoolean(4);
double x = rs.getDouble(5);
double y = rs.getDouble(6);
double width = rs.getDouble(7);
double height = rs.getDouble(8);
String itemName = rs.getString(3);

// Get details needed to apply a markup to the relevant item
long pageNumber = rs.getLong(4);
boolean isRedaction = rs.getBoolean(5);
double x = rs.getDouble(6);
double y = rs.getDouble(7);
double width = rs.getDouble(8);
double height = rs.getDouble(9);

// If our matching method is MD5, but the current record does not have an MD5 (likely because the originating item
// did not have an MD5, we let the user know and then skip this record.
if(md5 == null && matchingMethod == AnnotationMatchingMethod.MD5) {
logMessage("Record for item named '%s' with GUID %s does not have an MD5 value",itemName,guid);
continue;
}

Set<Item> items = null;

Expand All @@ -581,6 +671,18 @@ public void accept(ResultSet rs) {
for(Item item : items) {
MutablePrintedImage itemImage = item.getPrintedImage();
List<? extends PrintedPage> pages = itemImage.getPages();

if(pages == null || pages.size() < 1) {
logMessage("Item named '%s' and GUID %s has no printed pages, generating now...",itemName,guid);
itemImage.generate();
pages = itemImage.getPages();
}

if(pages.size() < pageNumber-1) {
logMessage("Item named '%s' and GUID %s does not have a page %s",itemName,guid,pageNumber);
continue;
}

MutablePrintedPage page = (MutablePrintedPage)pages.get((int) (pageNumber-1));
if(isRedaction) {
page.createRedaction(targetMarkupSet, x, y, width, height);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public static FormatUtility getInstance(){
* @return Byte array equivalent.
*/
public static byte[] hexToBytes(String hex){
if(hex == null) { return null; }
String normalizedHex = hex.replace("-", "");
return DatatypeConverter.parseHexBinary(normalizedHex);
}
Expand All @@ -53,6 +54,7 @@ public static byte[] hexToBytes(String hex){
* @return A string representation of the byte array as hexadecimal.
*/
public static String bytesToHex(byte[] bytes){
if(bytes == null) { return null; }
return DatatypeConverter.printHexBinary(bytes);
}

Expand Down
Loading

0 comments on commit f16caa8

Please sign in to comment.