5
5
import java .util .Collection ;
6
6
import java .util .Comparator ;
7
7
import java .util .HashSet ;
8
- import java .util .TreeMap ;
9
8
import java .util .List ;
10
9
import java .util .Map ;
11
10
import java .util .Set ;
11
+ import java .util .TreeMap ;
12
+ import java .util .concurrent .ForkJoinPool ;
13
+ import java .util .concurrent .atomic .AtomicInteger ;
12
14
import java .util .function .Consumer ;
13
15
import java .util .stream .Collectors ;
14
16
@@ -69,7 +71,7 @@ public void whenProgressUpdated(Consumer<BulkRedactorProgressInfo> callback) {
69
71
progressUpdatedCallback = callback ;
70
72
}
71
73
72
- private void fireProgressUpdated (BulkRedactorProgressInfo info ) {
74
+ private synchronized void fireProgressUpdated (BulkRedactorProgressInfo info ) {
73
75
if (progressUpdatedCallback != null ) {
74
76
progressUpdatedCallback .accept (info );
75
77
}
@@ -197,13 +199,17 @@ public List<NuixImageAnnotationRegion> findExpressionsInPdfFile(File file, Colle
197
199
* @param nuixCase The source Nuix case. Needed to obtain items (if none were given) and/or obtain the appropriate markup set.
198
200
* @param settings The settings used to find and generate the redactions.
199
201
* @param scopeItems Items to find and redact.
202
+ * @param concurrency Number of worker threads in the ForkJoinPool used to process items in parallel (must be at least 1).
200
203
* @throws Exception If something goes wrong
201
204
* @return Returns a list of all match region objects (so they can be reported, inspected, etc)
202
205
*/
203
- public List <NuixImageAnnotationRegion > findAndMarkup (Case nuixCase , BulkRedactorSettings settings , Collection <Item > scopeItems ) throws Exception {
206
+ public List <NuixImageAnnotationRegion > findAndMarkup (Case nuixCase , BulkRedactorSettings settings , Collection <Item > scopeItems , int concurrency ) throws Exception {
207
+ Collection <Item > itemsToProcess ;
204
208
if (scopeItems == null || scopeItems .size () < 1 ) {
205
209
logger .info ("No scopeItems were provided, using all items in case" );
206
- scopeItems = nuixCase .search ("" );
210
+ itemsToProcess = nuixCase .search ("" );
211
+ } else {
212
+ itemsToProcess = scopeItems ;
207
213
}
208
214
209
215
List <NuixImageAnnotationRegion > allFoundRegions = new ArrayList <NuixImageAnnotationRegion >();
@@ -213,12 +219,13 @@ public List<NuixImageAnnotationRegion> findAndMarkup(Case nuixCase, BulkRedactor
213
219
com .nuix .data .util .aspose .AsposePdf .ensureInitialised ();
214
220
215
221
PdfWorkCache pdfCache = new PdfWorkCache (settings .getTempDirectory ());
216
- MarkupSet markupSet = null ;
217
- if (settings .getApplyRedactions () || settings .getApplyRedactions ()) {
222
+ MarkupSet markupSet ;
223
+ if (settings .getApplyRedactions () || settings .getApplyHighLights ()) { // a markup set is needed when applying redactions OR highlights
218
224
markupSet = settings .getMarkupSet (nuixCase );
225
+ } else {
226
+ markupSet = null ;
219
227
}
220
- int currentIteration = 0 ;
221
- int matches = 0 ;
228
+
222
229
223
230
logMessage ("Regular Expressions:" );
224
231
for (String expression : settings .getExpressions ()) {
@@ -230,53 +237,75 @@ public List<NuixImageAnnotationRegion> findAndMarkup(Case nuixCase, BulkRedactor
230
237
logMessage (namedEntity );
231
238
}
232
239
233
- for (Item item : scopeItems ) {
234
- currentIteration += 1 ;
235
- File tempPdf = pdfCache .getPdfPath (item );
236
-
237
- List <NuixImageAnnotationRegion > regions = findExpressionsInPdfFile (tempPdf , settings .getExpressions ());
238
- if (regions .size () > 0 ) {
239
- for (NuixImageAnnotationRegion region : regions ) {
240
- region .setItem (item );
241
- }
242
- allFoundRegions .addAll (regions );
243
- logMessage ("Item with GUID %s had %s matches" ,item .getGuid (),regions .size ());
244
- for (NuixImageAnnotationRegion region : regions ) {
245
- matches ++;
246
- if (settings .getApplyRedactions ()) { region .applyRedaction (markupSet ); }
247
- if (settings .getApplyHighLights ()) { region .applyHighlight (markupSet ); }
248
- }
249
- }
250
-
251
- //Named entities require that we get matched values, convert those to expressions and then do another pass
252
- if (settings .getNamedEntityTypes ().size () > 0 ) {
253
- Set <String > entityValues = new HashSet <String >();
254
- for (String entityType : settings .getNamedEntityTypes ()) {
255
- entityValues .addAll (item .getEntities (entityType ));
256
- }
257
-
258
- Set <String > entityExpressions = entityValues .stream ().map (v -> BulkRedactorSettings .phraseToExpression (v )).collect (Collectors .toSet ());
259
- List <NuixImageAnnotationRegion > entityRegions = findExpressionsInPdfFile (tempPdf , entityExpressions );
260
- if (entityRegions .size () > 0 ) {
261
- for (NuixImageAnnotationRegion region : entityRegions ) {
262
- region .setItem (item );
240
+ int scopeItemsSize = itemsToProcess .size (); // scopeItems may be null/empty (whole-case fallback); size the total from what is actually processed
241
+ AtomicInteger currentIteration = new AtomicInteger (0 );
242
+ AtomicInteger matches = new AtomicInteger (0 );
243
+
244
+ Consumer <Item > workHorse = new Consumer <Item >() {
245
+ @ Override
246
+ public void accept (Item item ) {
247
+ try {
248
+ currentIteration .addAndGet (1 );
249
+ File tempPdf = pdfCache .getPdfPath (item );
250
+
251
+ Set <String > allExpressions = new HashSet <String >();
252
+ allExpressions .addAll (settings .getExpressions ());
253
+ if (settings .getNamedEntityTypes ().size () > 0 ) {
254
+ Set <String > entityValues = new HashSet <String >();
255
+ for (String entityType : settings .getNamedEntityTypes ()) {
256
+ entityValues .addAll (item .getEntities (entityType ));
257
+ }
258
+ entityValues .stream ().map (v -> BulkRedactorSettings .phraseToExpression (v )).forEach (new Consumer <String >() {
259
+ @ Override
260
+ public void accept (String exp ) {
261
+ allExpressions .add (exp );
262
+ }
263
+ });
263
264
}
264
- allFoundRegions .addAll (regions );
265
- logMessage ("Item with GUID %s had %s named entity matches" ,item .getGuid (),entityRegions .size ());
266
- for (NuixImageAnnotationRegion region : entityRegions ) {
267
- matches ++;
268
- if (settings .getApplyRedactions ()) { region .applyRedaction (markupSet ); }
269
- if (settings .getApplyHighLights ()) { region .applyHighlight (markupSet ); }
270
- }
265
+
266
+ List <NuixImageAnnotationRegion > regions = findExpressionsInPdfFile (tempPdf , allExpressions );
267
+ if (regions .size () > 0 ) {
268
+ for (NuixImageAnnotationRegion region : regions ) {
269
+ region .setItem (item );
270
+ }
271
+ synchronized (allFoundRegions ) { allFoundRegions .addAll (regions ); } // ArrayList is not thread-safe and this runs on parallel workers
272
+ logMessage ("Item with GUID %s had %s matches" ,item .getGuid (),regions .size ());
273
+ for (NuixImageAnnotationRegion region : regions ) {
274
+ if (settings .getApplyRedactions ()) { region .applyRedaction (markupSet ); }
275
+ if (settings .getApplyHighLights ()) { region .applyHighlight (markupSet ); }
276
+ }
277
+ matches .addAndGet (regions .size ());
278
+ }
279
+
280
+ pdfCache .forgetItem (item );
281
+
282
+ // Report progress
283
+ synchronized (this ) {
284
+ BulkRedactorProgressInfo progressInfo = new BulkRedactorProgressInfo ();
285
+ progressInfo .setCurrent (currentIteration .get ());
286
+ progressInfo .setTotal (scopeItemsSize );
287
+ progressInfo .setMatches (matches .get ());
288
+ fireProgressUpdated (progressInfo );
289
+ }
290
+ } catch (Exception e ) {
291
+ logMessage ("Exception processing item with GUID %s, %s (See Nuix logs for more detail)" , item .getGuid (), e .getMessage ());
292
+ logger .error (String .format ("Error while processing item with GUID %s" , item .getGuid ()),e );
271
293
}
272
294
}
273
-
274
- // Report progress
275
- BulkRedactorProgressInfo progressInfo = new BulkRedactorProgressInfo ();
276
- progressInfo .setCurrent (currentIteration );
277
- progressInfo .setTotal (scopeItems .size ());
278
- progressInfo .setMatches (matches );
279
- fireProgressUpdated (progressInfo );
295
+ };
296
+
297
+ ForkJoinPool pool = null ;
298
+ try {
299
+ pool = new ForkJoinPool (concurrency );
300
+ pool .submit (()->{
301
+ itemsToProcess .parallelStream ().forEach (workHorse );
302
+ }).get ();
303
+ } catch (Exception e ) {
304
+ logger .error ("Error while scanning" ,e );
305
+ throw e ;
306
+ } finally {
307
+ if (pool != null )
308
+ pool .shutdown ();
280
309
}
281
310
282
311
logMessage ("Cleaning up temp directory %s" ,settings .getTempDirectory ());
0 commit comments