Skip to content

Commit

Permalink
Merge pull request #4 from Nuix/named_entity_matching
Browse files Browse the repository at this point in the history
Named entity matching
  • Loading branch information
JuicyDragon authored Apr 23, 2019
2 parents 6120b0b + 3cbe42c commit e68dc4b
Show file tree
Hide file tree
Showing 181 changed files with 610 additions and 1,286 deletions.
47 changes: 44 additions & 3 deletions Java/src/main/java/com/nuix/superutilities/regex/RegexScanner.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.nuix.superutilities.regex;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
Expand Down Expand Up @@ -51,6 +52,9 @@ public static void setMaxToStringLength(int maxLength){
private boolean captureContextualText = true;
private int contextSize = 100;

private boolean matchNamedEntityValues = false;
private Set<String> namedEntityTypes = new HashSet<String>();

private boolean abortWasRequested = false;
private Object scanErrorLock = new Object();

Expand Down Expand Up @@ -292,9 +296,29 @@ public void accept(Item item) {
protected ItemRegexMatchCollection scanItem(Item item) {
ItemRegexMatchCollection itemMatches = new ItemRegexMatchCollection(item);

List<PatternInfo> patternsToScanFor = patterns;
if(matchNamedEntityValues && namedEntityTypes.size() > 0) {
patternsToScanFor = new ArrayList<PatternInfo>();
patternsToScanFor.addAll(patterns);
for(String namedEntityType : namedEntityTypes) {
try {
Set<String> entityValues = item.getEntities(namedEntityType);
for(String entityValue : entityValues) {
PatternInfo entityPattern = new PatternInfo(namedEntityType, "\\Q"+entityValue+"\\E");
entityPattern.compile(caseSensitive);
patternsToScanFor.add(entityPattern);
}
} catch (IOException e) {
RegexScanError error = new RegexScanError(item, null, "Named Entity Pattern Generation", e);
fireScanError(error);
}
}
}


if(scanProperties){
try {
for (PatternInfo p : patterns) {
for (PatternInfo p : patternsToScanFor) {
Matcher m = null;

for (Entry<String,String> propertyEntry : getStringProperties(item,propertiesToScan).entrySet()) {
Expand Down Expand Up @@ -332,7 +356,7 @@ protected ItemRegexMatchCollection scanItem(Item item) {

if(scanCustomMetadata){
try {
for (PatternInfo p : patterns) {
for (PatternInfo p : patternsToScanFor) {
Matcher m = null;

for (Entry<String,String> cmEntry : getStringCustomMetadata(item,customMetadataFieldsToScan).entrySet()) {
Expand Down Expand Up @@ -370,7 +394,7 @@ protected ItemRegexMatchCollection scanItem(Item item) {

if(scanContent){
try {
for (PatternInfo p : patterns) {
for (PatternInfo p : patternsToScanFor) {
try {
CharSequence contentTextCharSequence = item.getTextObject();
if(contentTextCharSequence != null){
Expand Down Expand Up @@ -534,6 +558,23 @@ public void setCustomMetadataToScan(List<String> fieldsToScan) {
}
}

/***
 * Reports whether scanning also matches against the named entity values of each item.
 * When this is true and at least one named entity type has been configured, the scan
 * adds a literal pattern for every entity value the item reports for those types.
 * @return True if named entity values are matched in addition to the configured patterns
 */
public boolean getMatchNamedEntityValues() {
	return this.matchNamedEntityValues;
}

/***
 * Enables or disables matching against the named entity values of each scanned item.
 * The setting only takes effect when at least one named entity type has also been configured.
 * @param matchNamedEntityValues True to additionally match named entity values, false to match only the configured patterns
 */
public void setMatchNamedEntityValues(final boolean matchNamedEntityValues) {
	this.matchNamedEntityValues = matchNamedEntityValues;
}

/***
 * Gets the named entity types whose values are matched when named entity matching is enabled.
 * The returned set is the scanner's own internal set rather than a copy, so changes made to it
 * are seen by the scanner.
 * @return The set of named entity type names currently configured
 */
public Set<String> getNamedEntityTypes() {
	return this.namedEntityTypes;
}

/***
 * Replaces the named entity types whose values are matched when named entity matching is enabled.
 * The provided values are copied into the scanner's own set; the provided collection itself is not retained.
 * It is safe to pass the set previously obtained from {@link #getNamedEntityTypes()}.
 * @param namedEntityTypes The named entity type names to use.  A null value is treated as an empty collection, clearing the configured types.
 */
public void setNamedEntityTypes(Collection<String> namedEntityTypes) {
	// Snapshot the incoming values before clearing.  getNamedEntityTypes() returns the
	// internal set, so a caller may hand that very same instance back to us; clearing
	// first would empty the argument and silently discard every configured type.
	Set<String> replacement = new HashSet<String>();
	if(namedEntityTypes != null) {
		replacement.addAll(namedEntityTypes);
	}
	this.namedEntityTypes.clear();
	this.namedEntityTypes.addAll(replacement);
}

/***
* When running a scan by providing a Consumer callback, this will signal
* that further scanning should be aborted.
Expand Down
55 changes: 1 addition & 54 deletions docs/allclasses-frame.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,69 +2,16 @@
<!-- NewPage -->
<html lang="en">
<head>
<!-- Generated by javadoc (1.8.0_181) on Thu Mar 07 09:33:40 PST 2019 -->
<!-- Generated by javadoc -->
<title>All Classes</title>
<meta name="date" content="2019-03-07">
<link rel="stylesheet" type="text/css" href="stylesheet.css" title="Style">
<script type="text/javascript" src="script.js"></script>
</head>
<body>
<h1 class="bar">All&nbsp;Classes</h1>
<div class="indexContainer">
<ul>
<li><a href="com/nuix/superutilities/annotations/AnnotationEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">AnnotationEvent</a></li>
<li><a href="com/nuix/superutilities/annotations/AnnotationHistoryRepository.html" title="class in com.nuix.superutilities.annotations" target="classFrame">AnnotationHistoryRepository</a></li>
<li><a href="com/nuix/superutilities/annotations/AnnotationHistoryRepositorySummary.html" title="class in com.nuix.superutilities.annotations" target="classFrame">AnnotationHistoryRepositorySummary</a></li>
<li><a href="com/nuix/superutilities/annotations/AnnotationSyncSettings.html" title="class in com.nuix.superutilities.annotations" target="classFrame">AnnotationSyncSettings</a></li>
<li><a href="com/nuix/superutilities/reporting/AsposeCellsColorHelper.html" title="class in com.nuix.superutilities.reporting" target="classFrame">AsposeCellsColorHelper</a></li>
<li><a href="com/nuix/superutilities/reporting/AsposeCellsStyleHelper.html" title="class in com.nuix.superutilities.reporting" target="classFrame">AsposeCellsStyleHelper</a></li>
<li><a href="com/nuix/superutilities/cases/BulkCaseProcessor.html" title="class in com.nuix.superutilities.cases" target="classFrame">BulkCaseProcessor</a></li>
<li><a href="com/nuix/superutilities/cases/CaseConsumer.html" title="interface in com.nuix.superutilities.cases" target="classFrame"><span class="interfaceName">CaseConsumer</span></a></li>
<li><a href="com/nuix/superutilities/cases/CaseHistoryHelper.html" title="class in com.nuix.superutilities.cases" target="classFrame">CaseHistoryHelper</a></li>
<li><a href="com/nuix/superutilities/cases/CaseInfo.html" title="class in com.nuix.superutilities.cases" target="classFrame">CaseInfo</a></li>
<li><a href="com/nuix/superutilities/cases/CaseIssueReaction.html" title="enum in com.nuix.superutilities.cases" target="classFrame">CaseIssueReaction</a></li>
<li><a href="com/nuix/superutilities/cases/CaseLockedEventInfo.html" title="class in com.nuix.superutilities.cases" target="classFrame">CaseLockedEventInfo</a></li>
<li><a href="com/nuix/superutilities/cases/CaseLockInfo.html" title="class in com.nuix.superutilities.cases" target="classFrame">CaseLockInfo</a></li>
<li><a href="com/nuix/superutilities/cases/CaseOpenErrorEvent.html" title="class in com.nuix.superutilities.cases" target="classFrame">CaseOpenErrorEvent</a></li>
<li><a href="com/nuix/superutilities/cases/CaseUtility.html" title="class in com.nuix.superutilities.cases" target="classFrame">CaseUtility</a></li>
<li><a href="com/nuix/superutilities/export/ChunkedDATExporter.html" title="class in com.nuix.superutilities.export" target="classFrame">ChunkedDATExporter</a></li>
<li><a href="com/nuix/superutilities/reporting/ColorRing.html" title="class in com.nuix.superutilities.reporting" target="classFrame">ColorRing</a></li>
<li><a href="com/nuix/superutilities/reporting/ColumnValueGenerator.html" title="class in com.nuix.superutilities.reporting" target="classFrame">ColumnValueGenerator</a></li>
<li><a href="com/nuix/superutilities/annotations/CustodianEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">CustodianEvent</a></li>
<li><a href="com/nuix/superutilities/annotations/CustomMetadataEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">CustomMetadataEvent</a></li>
<li><a href="com/nuix/superutilities/misc/DriveSpaceInfo.html" title="class in com.nuix.superutilities.misc" target="classFrame">DriveSpaceInfo</a></li>
<li><a href="com/nuix/superutilities/annotations/ExclusionEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">ExclusionEvent</a></li>
<li><a href="com/nuix/superutilities/misc/FormatUtility.html" title="class in com.nuix.superutilities.misc" target="classFrame">FormatUtility</a></li>
<li><a href="com/nuix/superutilities/misc/FreeSpaceMonitor.html" title="class in com.nuix.superutilities.misc" target="classFrame">FreeSpaceMonitor</a></li>
<li><a href="com/nuix/superutilities/misc/FreeSpaceMonitorEventCallback.html" title="interface in com.nuix.superutilities.misc" target="classFrame"><span class="interfaceName">FreeSpaceMonitorEventCallback</span></a></li>
<li><a href="com/nuix/superutilities/reporting/IntersectionReport.html" title="class in com.nuix.superutilities.reporting" target="classFrame">IntersectionReport</a></li>
<li><a href="com/nuix/superutilities/reporting/IntersectionReportProgressCallback.html" title="interface in com.nuix.superutilities.reporting" target="classFrame"><span class="interfaceName">IntersectionReportProgressCallback</span></a></li>
<li><a href="com/nuix/superutilities/reporting/IntersectionReportSheetConfiguration.html" title="class in com.nuix.superutilities.reporting" target="classFrame">IntersectionReportSheetConfiguration</a></li>
<li><a href="com/nuix/superutilities/regex/ItemRegexMatchCollection.html" title="class in com.nuix.superutilities.regex" target="classFrame">ItemRegexMatchCollection</a></li>
<li><a href="com/nuix/superutilities/annotations/ItemSetEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">ItemSetEvent</a></li>
<li><a href="com/nuix/superutilities/export/JsonExporter.html" title="class in com.nuix.superutilities.export" target="classFrame">JsonExporter</a></li>
<li><a href="com/nuix/superutilities/namedentities/NamedEntityRedactionProgressCallback.html" title="interface in com.nuix.superutilities.namedentities" target="classFrame"><span class="interfaceName">NamedEntityRedactionProgressCallback</span></a></li>
<li><a href="com/nuix/superutilities/namedentities/NamedEntityRedactionResults.html" title="class in com.nuix.superutilities.namedentities" target="classFrame">NamedEntityRedactionResults</a></li>
<li><a href="com/nuix/superutilities/namedentities/NamedEntityRedactionSettings.html" title="class in com.nuix.superutilities.namedentities" target="classFrame">NamedEntityRedactionSettings</a></li>
<li><a href="com/nuix/superutilities/namedentities/NamedEntityUtility.html" title="class in com.nuix.superutilities.namedentities" target="classFrame">NamedEntityUtility</a></li>
<li><a href="com/nuix/superutilities/reporting/NamedQuery.html" title="class in com.nuix.superutilities.reporting" target="classFrame">NamedQuery</a></li>
<li><a href="com/nuix/superutilities/misc/NuixVersion.html" title="class in com.nuix.superutilities.misc" target="classFrame">NuixVersion</a></li>
<li><a href="com/nuix/superutilities/regex/PatternInfo.html" title="class in com.nuix.superutilities.regex" target="classFrame">PatternInfo</a></li>
<li><a href="com/nuix/superutilities/misc/PdfUtility.html" title="class in com.nuix.superutilities.misc" target="classFrame">PdfUtility</a></li>
<li><a href="com/nuix/superutilities/misc/PlaceholderResolver.html" title="class in com.nuix.superutilities.misc" target="classFrame">PlaceholderResolver</a></li>
<li><a href="com/nuix/superutilities/annotations/ProductionSetEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">ProductionSetEvent</a></li>
<li><a href="com/nuix/superutilities/query/QueryHelper.html" title="class in com.nuix.superutilities.query" target="classFrame">QueryHelper</a></li>
<li><a href="com/nuix/superutilities/regex/RegexMatch.html" title="class in com.nuix.superutilities.regex" target="classFrame">RegexMatch</a></li>
<li><a href="com/nuix/superutilities/regex/RegexScanError.html" title="class in com.nuix.superutilities.regex" target="classFrame">RegexScanError</a></li>
<li><a href="com/nuix/superutilities/regex/RegexScanner.html" title="class in com.nuix.superutilities.regex" target="classFrame">RegexScanner</a></li>
<li><a href="com/nuix/superutilities/reporting/ScriptedColumnValueGenerator.html" title="class in com.nuix.superutilities.reporting" target="classFrame">ScriptedColumnValueGenerator</a></li>
<li><a href="com/nuix/superutilities/reporting/SimpleWorksheet.html" title="class in com.nuix.superutilities.reporting" target="classFrame">SimpleWorksheet</a></li>
<li><a href="com/nuix/superutilities/reporting/SimpleXlsx.html" title="class in com.nuix.superutilities.reporting" target="classFrame">SimpleXlsx</a></li>
<li><a href="com/nuix/superutilities/items/SuperItemUtility.html" title="class in com.nuix.superutilities.items" target="classFrame">SuperItemUtility</a></li>
<li><a href="com/nuix/superutilities/SuperUtilities.html" title="class in com.nuix.superutilities" target="classFrame">SuperUtilities</a></li>
<li><a href="com/nuix/superutilities/annotations/TagEvent.html" title="class in com.nuix.superutilities.annotations" target="classFrame">TagEvent</a></li>
<li><a href="com/nuix/superutilities/export/TemplateExporter.html" title="class in com.nuix.superutilities.export" target="classFrame">TemplateExporter</a></li>
<li><a href="com/nuix/superutilities/cases/WorkFunctionErrorEvent.html" title="class in com.nuix.superutilities.cases" target="classFrame">WorkFunctionErrorEvent</a></li>
</ul>
</div>
</body>
Expand Down
Loading

0 comments on commit e68dc4b

Please sign in to comment.