Skip to content

Commit b4cd9b5

Browse files
authored
Merge pull request #6 from Nuix/term_expander
Term expander
2 parents 2a96f1e + 9b6ca4e commit b4cd9b5

File tree

7 files changed

+615
-0
lines changed

7 files changed

+615
-0
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package com.nuix.superutilities.misc;
2+
3+
/**
 * Encapsulates information about a term designated as a match result generated by {@link TermExpander}.
 * <p>
 * This is a plain mutable bean: each of the four properties (original term, matched term,
 * occurrence count and similarity) has a getter and a setter, and no validation is applied.
 * The spelling "ocurrences" is part of the public accessor names and of the {@link #toString()}
 * output, so it is intentionally left unchanged.
 *
 * @author Jason Wells
 */
public class ExpandedTermInfo {
	// String properties start as empty strings rather than null.
	private String originalTerm = "";
	private String matchedTerm = "";

	// Numeric properties start at zero.
	private long ocurrences = 0L;
	private float similarity = 0.0f;

	/**
	 * @return The current original term value (empty string until set).
	 */
	public String getOriginalTerm() {
		return this.originalTerm;
	}

	/**
	 * @param originalTerm The original term value to store; stored as-is.
	 */
	public void setOriginalTerm(String originalTerm) {
		this.originalTerm = originalTerm;
	}

	/**
	 * @return The current matched term value (empty string until set).
	 */
	public String getMatchedTerm() {
		return this.matchedTerm;
	}

	/**
	 * @param matchedTerm The matched term value to store; stored as-is.
	 */
	public void setMatchedTerm(String matchedTerm) {
		this.matchedTerm = matchedTerm;
	}

	/**
	 * @return The current occurrence count (0 until set).
	 */
	public long getOcurrences() {
		return this.ocurrences;
	}

	/**
	 * @param ocurrences The occurrence count to store; stored as-is.
	 */
	public void setOcurrences(long ocurrences) {
		this.ocurrences = ocurrences;
	}

	/**
	 * @return The current similarity value (0.0 until set).
	 */
	public float getSimilarity() {
		return this.similarity;
	}

	/**
	 * @param similarity The similarity value to store; stored as-is.
	 */
	public void setSimilarity(float similarity) {
		this.similarity = similarity;
	}

	/**
	 * @return A single-line rendering listing all four properties.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("ExpandedTermInfo [originalTerm=");
		text.append(this.originalTerm);
		text.append(", matchedTerm=").append(this.matchedTerm);
		text.append(", ocurrences=").append(this.ocurrences);
		text.append(", similarity=").append(this.similarity);
		text.append("]");
		return text.toString();
	}
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
package com.nuix.superutilities.misc;
2+
3+
import java.util.regex.Matcher;
4+
import java.util.regex.Pattern;
5+
6+
import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
7+
import org.apache.lucene.search.spell.LevensteinDistance;
8+
import org.apache.lucene.search.spell.JaroWinklerDistance;
9+
import org.apache.lucene.search.spell.NGramDistance;
10+
11+
/***
12+
* Encapsulates information about a fuzzy term expression. Used by {@link TermExpander}.
13+
* @author Jason Wells
14+
*
15+
*/
16+
public class FuzzyTermInfo {
17+
private static Pattern fuzzyPattern = Pattern.compile("(?<term>([a-z0-9]+))~(?<similarity>([0-1]\\.?[0-9]*)?)",Pattern.CASE_INSENSITIVE);
18+
19+
private static LevensteinDistance levDist = new LevensteinDistance();
20+
private static LuceneLevenshteinDistance luceneLevDist = new LuceneLevenshteinDistance();
21+
private static JaroWinklerDistance jaroDist = new JaroWinklerDistance();
22+
private static NGramDistance ngramDist = new NGramDistance();
23+
24+
public static boolean isFuzzyTerm(String term) {
25+
return fuzzyPattern.matcher(term.trim()).find();
26+
}
27+
28+
/***
29+
* Parses a fuzzy term string into component term and similarity score. When a similarity score is
30+
* not present defaults to 0.5 (like <a href="https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches">Lucene</a>).
31+
* @param term Fuzzy term expression to parse in form: <code>term~0.5</code> or <code>term~</code>.
32+
* @return A Fuzzy object containing term and similarity score.
33+
*/
34+
public static FuzzyTermInfo parseFuzzyTerm(String term) {
35+
Matcher m = fuzzyPattern.matcher(term);
36+
FuzzyTermInfo f = new FuzzyTermInfo();
37+
if(m.find()) {
38+
f.term = m.group("term");
39+
String similarity = m.group("similarity");
40+
if(similarity.trim().isEmpty()) {
41+
f.similarity = 0.5f;
42+
} else {
43+
f.setTargetSimilarity(Float.parseFloat(similarity));
44+
}
45+
}
46+
return f;
47+
}
48+
49+
public float calculateLevensteinSimilarityTo(String otherTerm) {
50+
return levDist.getDistance(this.term, otherTerm);
51+
}
52+
53+
public float calculateLuceneLevenshteinSimilarityTo(String otherTerm) {
54+
return luceneLevDist.getDistance(this.term, otherTerm);
55+
}
56+
57+
public float calculateJaroWinklerSimilarityTo(String otherTerm) {
58+
return jaroDist.getDistance(this.term, otherTerm);
59+
}
60+
61+
public float calculateNGramSimilarityTo(String otherTerm) {
62+
return ngramDist.getDistance(this.term, otherTerm);
63+
}
64+
65+
private String term = "";
66+
private float similarity = 0.5f;
67+
public FuzzyTermInfo() {}
68+
69+
public String getTerm() {
70+
return term;
71+
}
72+
73+
public void setTerm(String term) {
74+
this.term = term;
75+
}
76+
77+
public float getTargetSimilarity() {
78+
return similarity;
79+
}
80+
81+
public void setTargetSimilarity(float similarity) {
82+
if(similarity < 0.0f) { this.similarity = 0.0f; }
83+
else if(similarity > 1.0f) { this.similarity = 1.0f; }
84+
else { this.similarity = similarity; }
85+
}
86+
87+
88+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package com.nuix.superutilities.misc;
2+
3+
/**
 * Computes the Levenshtein (edit) distance between two strings, ignoring letter case.
 * <p>
 * Algorithm adapted from https://rosettacode.org/wiki/Levenshtein_distance#Java
 */
public class LevenshteinDistance {
	/**
	 * Calculates the case-insensitive Levenshtein distance between two strings: the minimum number
	 * of single character insertions, deletions and substitutions needed to turn one into the other.
	 * @param a First string; must not be null.
	 * @param b Second string; must not be null.
	 * @return The edit distance; 0 when the strings are equal ignoring case.
	 * @throws NullPointerException If either argument is null.
	 */
	public static int calculate(String a, String b) {
		java.util.Objects.requireNonNull(a, "a must not be null");
		java.util.Objects.requireNonNull(b, "b must not be null");

		// Lower-case with a fixed locale: the default-locale variant maps 'I' to a dotless 'ı'
		// under Turkish/Azeri locales, which made "TITLE" and "title" differ.
		a = a.toLowerCase(java.util.Locale.ROOT);
		b = b.toLowerCase(java.util.Locale.ROOT);

		// Single rolling row of the distance matrix; initially row i == 0, where lev(0, j) == j.
		int[] costs = new int[b.length() + 1];
		for (int j = 0; j < costs.length; j++) {
			costs[j] = j;
		}

		for (int i = 1; i <= a.length(); i++) {
			// Column j == 0: lev(i, 0) == i. nw carries the diagonal value lev(i - 1, j - 1).
			costs[0] = i;
			int nw = i - 1;
			for (int j = 1; j <= b.length(); j++) {
				// costs[j] still holds lev(i - 1, j) (deletion); costs[j - 1] already holds lev(i, j - 1) (insertion).
				int insertOrDelete = 1 + Math.min(costs[j], costs[j - 1]);
				int substitute = a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1;
				int cj = Math.min(insertOrDelete, substitute);
				nw = costs[j];
				costs[j] = cj;
			}
		}
		return costs[b.length()];
	}
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package com.nuix.superutilities.misc;
2+
3+
/**
 * Selects which method {@link com.nuix.superutilities.misc.TermExpander} uses to calculate
 * fuzzy similarity when expanding a given fuzzy term into matched terms.
 * <p>
 * The constant names (including their spelling) and their declaration order are part of the
 * public API and are preserved exactly.
 *
 * @author Jason Wells
 */
public enum SimilarityCalculation {
	Nuix, Levenstein, LuceneLevenshstein, JaroWinkler, NGram;
}

0 commit comments

Comments
 (0)