-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from Nuix/term_expander
Term expander
- Loading branch information
Showing
7 changed files
with
615 additions
and
0 deletions.
There are no files selected for viewing
47 changes: 47 additions & 0 deletions
47
Java/src/main/java/com/nuix/superutilities/misc/ExpandedTermInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package com.nuix.superutilities.misc; | ||
|
||
/***
 * Simple mutable value holder describing one term that {@link TermExpander} reported as a
 * match result. It pairs the original term with the matched term, an occurrence count and a
 * similarity score. The class performs no validation; all values are stored as given.
 * @author Jason Wells
 *
 */
public class ExpandedTermInfo {
    // Defaults mirror a "nothing matched yet" state: empty strings and zeroed numbers.
    private String sourceTerm = "";
    private String resultTerm = "";
    private long occurrenceCount = 0L;
    private float similarityScore = 0.0f;

    /**
     * @return The original term value currently stored (empty string by default).
     */
    public String getOriginalTerm() {
        return this.sourceTerm;
    }

    /**
     * @param originalTerm The original term value to store.
     */
    public void setOriginalTerm(String originalTerm) {
        this.sourceTerm = originalTerm;
    }

    /**
     * @return The matched term value currently stored (empty string by default).
     */
    public String getMatchedTerm() {
        return this.resultTerm;
    }

    /**
     * @param matchedTerm The matched term value to store.
     */
    public void setMatchedTerm(String matchedTerm) {
        this.resultTerm = matchedTerm;
    }

    /**
     * @return The occurrence count currently stored (0 by default).
     */
    public long getOcurrences() {
        return this.occurrenceCount;
    }

    /**
     * @param ocurrences The occurrence count to store.
     */
    public void setOcurrences(long ocurrences) {
        this.occurrenceCount = ocurrences;
    }

    /**
     * @return The similarity score currently stored (0.0 by default).
     */
    public float getSimilarity() {
        return this.similarityScore;
    }

    /**
     * @param similarity The similarity score to store.
     */
    public void setSimilarity(float similarity) {
        this.similarityScore = similarity;
    }

    /**
     * Renders all four values in a single bracketed line, intended for logging/debugging.
     * The label spelling ("ocurrences") is kept as-is because it is part of the emitted text.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ExpandedTermInfo [");
        text.append("originalTerm=").append(this.sourceTerm);
        text.append(", matchedTerm=").append(this.resultTerm);
        text.append(", ocurrences=").append(this.occurrenceCount);
        text.append(", similarity=").append(this.similarityScore);
        text.append("]");
        return text.toString();
    }
}
88 changes: 88 additions & 0 deletions
88
Java/src/main/java/com/nuix/superutilities/misc/FuzzyTermInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
package com.nuix.superutilities.misc; | ||
|
||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.apache.lucene.search.spell.LuceneLevenshteinDistance; | ||
import org.apache.lucene.search.spell.LevensteinDistance; | ||
import org.apache.lucene.search.spell.JaroWinklerDistance; | ||
import org.apache.lucene.search.spell.NGramDistance; | ||
|
||
/*** | ||
* Encapsulates information about a fuzzy term expression. Used by {@link TermExpander}. | ||
* @author Jason Wells | ||
* | ||
*/ | ||
public class FuzzyTermInfo { | ||
private static Pattern fuzzyPattern = Pattern.compile("(?<term>([a-z0-9]+))~(?<similarity>([0-1]\\.?[0-9]*)?)",Pattern.CASE_INSENSITIVE); | ||
|
||
private static LevensteinDistance levDist = new LevensteinDistance(); | ||
private static LuceneLevenshteinDistance luceneLevDist = new LuceneLevenshteinDistance(); | ||
private static JaroWinklerDistance jaroDist = new JaroWinklerDistance(); | ||
private static NGramDistance ngramDist = new NGramDistance(); | ||
|
||
public static boolean isFuzzyTerm(String term) { | ||
return fuzzyPattern.matcher(term.trim()).find(); | ||
} | ||
|
||
/*** | ||
* Parses a fuzzy term string into component term and similarity score. When a similarity score is | ||
* not present defaults to 0.5 (like <a href="https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches">Lucene</a>). | ||
* @param term Fuzzy term expression to parse in form: <code>term~0.5</code> or <code>term~</code>. | ||
* @return A Fuzzy object containing term and similarity score. | ||
*/ | ||
public static FuzzyTermInfo parseFuzzyTerm(String term) { | ||
Matcher m = fuzzyPattern.matcher(term); | ||
FuzzyTermInfo f = new FuzzyTermInfo(); | ||
if(m.find()) { | ||
f.term = m.group("term"); | ||
String similarity = m.group("similarity"); | ||
if(similarity.trim().isEmpty()) { | ||
f.similarity = 0.5f; | ||
} else { | ||
f.setTargetSimilarity(Float.parseFloat(similarity)); | ||
} | ||
} | ||
return f; | ||
} | ||
|
||
public float calculateLevensteinSimilarityTo(String otherTerm) { | ||
return levDist.getDistance(this.term, otherTerm); | ||
} | ||
|
||
public float calculateLuceneLevenshteinSimilarityTo(String otherTerm) { | ||
return luceneLevDist.getDistance(this.term, otherTerm); | ||
} | ||
|
||
public float calculateJaroWinklerSimilarityTo(String otherTerm) { | ||
return jaroDist.getDistance(this.term, otherTerm); | ||
} | ||
|
||
public float calculateNGramSimilarityTo(String otherTerm) { | ||
return ngramDist.getDistance(this.term, otherTerm); | ||
} | ||
|
||
private String term = ""; | ||
private float similarity = 0.5f; | ||
public FuzzyTermInfo() {} | ||
|
||
public String getTerm() { | ||
return term; | ||
} | ||
|
||
public void setTerm(String term) { | ||
this.term = term; | ||
} | ||
|
||
public float getTargetSimilarity() { | ||
return similarity; | ||
} | ||
|
||
public void setTargetSimilarity(float similarity) { | ||
if(similarity < 0.0f) { this.similarity = 0.0f; } | ||
else if(similarity > 1.0f) { this.similarity = 1.0f; } | ||
else { this.similarity = similarity; } | ||
} | ||
|
||
|
||
} |
24 changes: 24 additions & 0 deletions
24
Java/src/main/java/com/nuix/superutilities/misc/LevenshteinDistance.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
package com.nuix.superutilities.misc; | ||
|
||
public class LevenshteinDistance {
    /**
     * Computes the case-insensitive Levenshtein (edit) distance between two strings: the minimum
     * number of single-character insertions, deletions and substitutions needed to turn one into
     * the other. Comparison is done per UTF-16 <code>char</code> after lower-casing both inputs.
     * <p>
     * Adapted from https://rosettacode.org/wiki/Levenshtein_distance#Java
     *
     * @param a First string, must not be null.
     * @param b Second string, must not be null.
     * @return The edit distance; 0 when the strings are equal ignoring case.
     * @throws NullPointerException if either argument is null.
     */
    public static int calculate(String a, String b) {
        // Locale.ROOT keeps case folding independent of the JVM default locale; with the
        // no-argument toLowerCase() a Turkish default locale maps 'I' to dotless 'ı', so
        // "TITLE" and "title" would wrongly be reported as distance 1.
        a = a.toLowerCase(java.util.Locale.ROOT);
        b = b.toLowerCase(java.util.Locale.ROOT);

        // Single-row dynamic programming: costs[j] holds lev(i, j) for the current row i.
        // Row i == 0 is the cost of building the first j characters of b from nothing.
        int[] costs = new int[b.length() + 1];
        for (int j = 0; j < costs.length; j++) {
            costs[j] = j;
        }

        for (int i = 1; i <= a.length(); i++) {
            // Column j == 0: deleting the first i characters of a.
            costs[0] = i;
            // nw tracks lev(i - 1, j - 1), the "north-west" cell that is about to be overwritten.
            int nw = i - 1;
            for (int j = 1; j <= b.length(); j++) {
                int substitution = a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1;
                // costs[j] is still lev(i - 1, j) (deletion); costs[j - 1] is lev(i, j - 1) (insertion).
                int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), substitution);
                nw = costs[j];
                costs[j] = cj;
            }
        }
        return costs[b.length()];
    }
}
15 changes: 15 additions & 0 deletions
15
Java/src/main/java/com/nuix/superutilities/misc/SimilarityCalculation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.nuix.superutilities.misc; | ||
|
||
/***
 * Selects which similarity calculation {@link com.nuix.superutilities.misc.TermExpander} should
 * apply when expanding a fuzzy term into matched terms.
 * Constant names (including their spelling) are part of the public API and must not be changed.
 * @author Jason Wells
 *
 */
public enum SimilarityCalculation {
    /** NOTE(review): presumably defers to Nuix's own fuzzy matching - confirm in TermExpander. */
    Nuix,

    /** Presumably pairs with {@code FuzzyTermInfo.calculateLevensteinSimilarityTo} - confirm in TermExpander. */
    Levenstein,

    /** Presumably pairs with {@code FuzzyTermInfo.calculateLuceneLevenshteinSimilarityTo} - confirm in TermExpander. */
    LuceneLevenshstein,

    /** Presumably pairs with {@code FuzzyTermInfo.calculateJaroWinklerSimilarityTo} - confirm in TermExpander. */
    JaroWinkler,

    /** Presumably pairs with {@code FuzzyTermInfo.calculateNGramSimilarityTo} - confirm in TermExpander. */
    NGram
}
Oops, something went wrong.