Skip to content

Commit 932b76f

Browse files
authored
Merge pull request #19 from Nuix/bulk-redactor-threading
add parallelism to bulk redactor
2 parents ef90197 + 9b2dc28 commit 932b76f

File tree

4 files changed

+86
-55
lines changed

4 files changed

+86
-55
lines changed

Java/src/main/java/com/nuix/superutilities/annotations/AnnotationEvent.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
import java.util.Collection;
44

5+
import org.apache.log4j.Logger;
56
import org.joda.time.DateTime;
67

7-
import jxl.common.Logger;
88
import nuix.Case;
99
import nuix.Item;
1010

Java/src/main/java/com/nuix/superutilities/annotations/AnnotationRepository.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import com.nuix.superutilities.misc.SQLiteBacked;
2424
import com.nuix.superutilities.query.QueryHelper;
2525

26-
import jxl.common.Logger;
26+
import org.apache.log4j.Logger;
2727
import nuix.BulkAnnotater;
2828
import nuix.Case;
2929
import nuix.Item;

Java/src/main/java/com/nuix/superutilities/annotations/BulkRedactor.java

Lines changed: 81 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55
import java.util.Collection;
66
import java.util.Comparator;
77
import java.util.HashSet;
8-
import java.util.TreeMap;
98
import java.util.List;
109
import java.util.Map;
1110
import java.util.Set;
11+
import java.util.TreeMap;
12+
import java.util.concurrent.ForkJoinPool;
13+
import java.util.concurrent.atomic.AtomicInteger;
1214
import java.util.function.Consumer;
1315
import java.util.stream.Collectors;
1416

@@ -69,7 +71,7 @@ public void whenProgressUpdated(Consumer<BulkRedactorProgressInfo> callback) {
6971
progressUpdatedCallback = callback;
7072
}
7173

72-
private void fireProgressUpdated(BulkRedactorProgressInfo info) {
74+
private synchronized void fireProgressUpdated(BulkRedactorProgressInfo info) {
7375
if(progressUpdatedCallback != null) {
7476
progressUpdatedCallback.accept(info);
7577
}
@@ -197,13 +199,17 @@ public List<NuixImageAnnotationRegion> findExpressionsInPdfFile(File file, Colle
197199
* @param nuixCase The source Nuix case. Needed to obtain items (if none were given) and/or obtain the appropriate markup set.
198200
* @param settings The settings used to find and generate the redactions.
199201
* @param scopeItems Items to find and redact.
202+
* @param concurrency Number of threads (parallelism level) given to the ForkJoinPool that processes the items in parallel
200203
* @throws Exception If something goes wrong
201204
* @return Returns a list of all match region objects (so they can be reported, inspected, etc)
202205
*/
203-
public List<NuixImageAnnotationRegion> findAndMarkup(Case nuixCase, BulkRedactorSettings settings, Collection<Item> scopeItems) throws Exception {
206+
public List<NuixImageAnnotationRegion> findAndMarkup(Case nuixCase, BulkRedactorSettings settings, Collection<Item> scopeItems, int concurrency) throws Exception {
207+
Collection<Item> itemsToProcess;
204208
if(scopeItems == null || scopeItems.size() < 1) {
205209
logger.info("No scopeItems were provided, using all items in case");
206-
scopeItems = nuixCase.search("");
210+
itemsToProcess = nuixCase.search("");
211+
} else {
212+
itemsToProcess = scopeItems;
207213
}
208214

209215
List<NuixImageAnnotationRegion> allFoundRegions = new ArrayList<NuixImageAnnotationRegion>();
@@ -213,12 +219,13 @@ public List<NuixImageAnnotationRegion> findAndMarkup(Case nuixCase, BulkRedactor
213219
com.nuix.data.util.aspose.AsposePdf.ensureInitialised();
214220

215221
PdfWorkCache pdfCache = new PdfWorkCache(settings.getTempDirectory());
216-
MarkupSet markupSet = null;
217-
if (settings.getApplyRedactions() || settings.getApplyRedactions()) {
222+
MarkupSet markupSet;
223+
if (settings.getApplyRedactions() || settings.getApplyRedactions()) {
218224
markupSet = settings.getMarkupSet(nuixCase);
225+
} else {
226+
markupSet = null;
219227
}
220-
int currentIteration = 0;
221-
int matches = 0;
228+
222229

223230
logMessage("Regular Expressions:");
224231
for(String expression : settings.getExpressions()) {
@@ -230,53 +237,75 @@ public List<NuixImageAnnotationRegion> findAndMarkup(Case nuixCase, BulkRedactor
230237
logMessage(namedEntity);
231238
}
232239

233-
for(Item item : scopeItems) {
234-
currentIteration += 1;
235-
File tempPdf = pdfCache.getPdfPath(item);
236-
237-
List<NuixImageAnnotationRegion> regions = findExpressionsInPdfFile(tempPdf, settings.getExpressions());
238-
if(regions.size() > 0) {
239-
for(NuixImageAnnotationRegion region : regions) {
240-
region.setItem(item);
241-
}
242-
allFoundRegions.addAll(regions);
243-
logMessage("Item with GUID %s had %s matches",item.getGuid(),regions.size());
244-
for(NuixImageAnnotationRegion region : regions) {
245-
matches++;
246-
if(settings.getApplyRedactions()) { region.applyRedaction(markupSet); }
247-
if(settings.getApplyHighLights()) { region.applyHighlight(markupSet); }
248-
}
249-
}
250-
251-
//Named entities require that we get matched values, convert those to expressions and then do another pass
252-
if (settings.getNamedEntityTypes().size() > 0) {
253-
Set<String> entityValues = new HashSet<String>();
254-
for(String entityType : settings.getNamedEntityTypes()) {
255-
entityValues.addAll(item.getEntities(entityType));
256-
}
257-
258-
Set<String> entityExpressions = entityValues.stream().map(v -> BulkRedactorSettings.phraseToExpression(v)).collect(Collectors.toSet());
259-
List<NuixImageAnnotationRegion> entityRegions = findExpressionsInPdfFile(tempPdf, entityExpressions);
260-
if(entityRegions.size() > 0) {
261-
for(NuixImageAnnotationRegion region : entityRegions) {
262-
region.setItem(item);
240+
int scopeItemsSize = scopeItems.size();
241+
AtomicInteger currentIteration = new AtomicInteger(0);
242+
AtomicInteger matches = new AtomicInteger(0);
243+
244+
Consumer<Item> workHorse = new Consumer<Item>() {
245+
@Override
246+
public void accept(Item item) {
247+
try {
248+
currentIteration.addAndGet(1);
249+
File tempPdf = pdfCache.getPdfPath(item);
250+
251+
Set<String> allExpressions = new HashSet<String>();
252+
allExpressions.addAll(settings.getExpressions());
253+
if (settings.getNamedEntityTypes().size() > 0) {
254+
Set<String> entityValues = new HashSet<String>();
255+
for(String entityType : settings.getNamedEntityTypes()) {
256+
entityValues.addAll(item.getEntities(entityType));
257+
}
258+
entityValues.stream().map(v -> BulkRedactorSettings.phraseToExpression(v)).forEach(new Consumer<String>() {
259+
@Override
260+
public void accept(String exp) {
261+
allExpressions.add(exp);
262+
}
263+
});
263264
}
264-
allFoundRegions.addAll(regions);
265-
logMessage("Item with GUID %s had %s named entity matches",item.getGuid(),entityRegions.size());
266-
for(NuixImageAnnotationRegion region : entityRegions) {
267-
matches++;
268-
if(settings.getApplyRedactions()) { region.applyRedaction(markupSet); }
269-
if(settings.getApplyHighLights()) { region.applyHighlight(markupSet); }
270-
}
265+
266+
List<NuixImageAnnotationRegion> regions = findExpressionsInPdfFile(tempPdf, allExpressions);
267+
if(regions.size() > 0) {
268+
for(NuixImageAnnotationRegion region : regions) {
269+
region.setItem(item);
270+
}
271+
allFoundRegions.addAll(regions);
272+
logMessage("Item with GUID %s had %s matches",item.getGuid(),regions.size());
273+
for(NuixImageAnnotationRegion region : regions) {
274+
if(settings.getApplyRedactions()) { region.applyRedaction(markupSet); }
275+
if(settings.getApplyHighLights()) { region.applyHighlight(markupSet); }
276+
}
277+
matches.addAndGet(regions.size());
278+
}
279+
280+
pdfCache.forgetItem(item);
281+
282+
// Report progress
283+
synchronized(this) {
284+
BulkRedactorProgressInfo progressInfo = new BulkRedactorProgressInfo();
285+
progressInfo.setCurrent(currentIteration.get());
286+
progressInfo.setTotal(scopeItemsSize);
287+
progressInfo.setMatches(matches.get());
288+
fireProgressUpdated(progressInfo);
289+
}
290+
} catch (Exception e) {
291+
logMessage("Exception processing item with GUID %s, %s (See Nuix logs for more detail)", item.getGuid(), e.getMessage());
292+
logger.error(String.format("Error while processing item with GUID %s", item.getGuid()),e);
271293
}
272294
}
273-
274-
// Report progress
275-
BulkRedactorProgressInfo progressInfo = new BulkRedactorProgressInfo();
276-
progressInfo.setCurrent(currentIteration);
277-
progressInfo.setTotal(scopeItems.size());
278-
progressInfo.setMatches(matches);
279-
fireProgressUpdated(progressInfo);
295+
};
296+
297+
ForkJoinPool pool = null;
298+
try {
299+
pool = new ForkJoinPool(concurrency);
300+
pool.submit(()->{
301+
itemsToProcess.parallelStream().forEach(workHorse);
302+
}).get();
303+
} catch (Exception e) {
304+
logger.error("Error while scanning",e);
305+
throw e;
306+
} finally {
307+
if(pool != null)
308+
pool.shutdown();
280309
}
281310

282311
logMessage("Cleaning up temp directory %s",settings.getTempDirectory());

Java/src/main/java/com/nuix/superutilities/export/PdfWorkCache.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@ public synchronized File getPdfPath(Item item) throws Exception {
5353
tempPdf = new File(tempPdf,guid.substring(3, 6));
5454
tempPdf.mkdirs();
5555
tempPdf = new File(tempPdf,item.getGuid()+".pdf");
56-
item.getPrintedImage().generate(printSettings); // Make sure PDF is generated or export can have issues
56+
if(!item.getPrintedImage().isStored()) {
57+
item.getPrintedImage().generate(printSettings); // Make sure PDF is generated or export can have issues
58+
}
5759
pdfExporter.exportItem(item, tempPdf);
5860
pdfCache.put(item.getGuid(), tempPdf);
5961
}

0 commit comments

Comments
 (0)