Skip to content

Commit

Permalink
Merge pull request #6 from Nuix/term_expander
Browse files Browse the repository at this point in the history
Term expander
  • Loading branch information
JuicyDragon authored May 14, 2019
2 parents 2a96f1e + 9b6ca4e commit b4cd9b5
Show file tree
Hide file tree
Showing 7 changed files with 615 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package com.nuix.superutilities.misc;

/***
 * Mutable value holder describing one match result generated by {@link TermExpander}:
 * the original term, the term it was matched to, an occurrence count and a similarity score.
 * @author Jason Wells
 *
 */
public class ExpandedTermInfo {
    /** Term the match was generated for; starts out as the empty string. */
    private String originalTerm = "";

    /** Term reported as the match; starts out as the empty string. */
    private String matchedTerm = "";

    /**
     * Occurrence count associated with this result (spelling kept to match the public accessors).
     * NOTE(review): presumably how often the matched term occurs - confirm against TermExpander.
     */
    private long ocurrences = 0;

    /** Similarity score recorded for this result; starts out as 0. */
    private float similarity = 0.0f;

    /**
     * @return The original term.
     */
    public String getOriginalTerm() {
        return this.originalTerm;
    }

    /**
     * @param originalTerm The original term to record.
     */
    public void setOriginalTerm(String originalTerm) {
        this.originalTerm = originalTerm;
    }

    /**
     * @return The matched term.
     */
    public String getMatchedTerm() {
        return this.matchedTerm;
    }

    /**
     * @param matchedTerm The matched term to record.
     */
    public void setMatchedTerm(String matchedTerm) {
        this.matchedTerm = matchedTerm;
    }

    /**
     * @return The recorded occurrence count.
     */
    public long getOcurrences() {
        return this.ocurrences;
    }

    /**
     * @param ocurrences The occurrence count to record.
     */
    public void setOcurrences(long ocurrences) {
        this.ocurrences = ocurrences;
    }

    /**
     * @return The recorded similarity score.
     */
    public float getSimilarity() {
        return this.similarity;
    }

    /**
     * @param similarity The similarity score to record.
     */
    public void setSimilarity(float similarity) {
        this.similarity = similarity;
    }

    /**
     * @return A single-line listing of every field, intended for debugging and logging.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ExpandedTermInfo [");
        text.append("originalTerm=").append(this.originalTerm);
        text.append(", matchedTerm=").append(this.matchedTerm);
        text.append(", ocurrences=").append(this.ocurrences);
        text.append(", similarity=").append(this.similarity);
        return text.append("]").toString();
    }
}
88 changes: 88 additions & 0 deletions Java/src/main/java/com/nuix/superutilities/misc/FuzzyTermInfo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package com.nuix.superutilities.misc;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.NGramDistance;

/***
 * Encapsulates information about a fuzzy term expression (<code>term~0.5</code> or <code>term~</code>):
 * the term itself and the target similarity score. Used by {@link TermExpander}.
 * Also offers helpers which score another term against this one using several Lucene string
 * distance implementations.
 * @author Jason Wells
 *
 */
public class FuzzyTermInfo {
    /** Similarity used when a fuzzy expression does not specify one. */
    private static final float DEFAULT_SIMILARITY = 0.5f;

    /**
     * Matches <code>term~</code> optionally followed by a similarity value. The similarity group always
     * participates in a match; it captures the empty string when no value follows the tilde.
     */
    private static final Pattern FUZZY_PATTERN = Pattern.compile("(?<term>([a-z0-9]+))~(?<similarity>([0-1]\\.?[0-9]*)?)",Pattern.CASE_INSENSITIVE);

    // Shared scorer instances, created once and reused by every FuzzyTermInfo.
    private static final LevensteinDistance LEVENSTEIN = new LevensteinDistance();
    private static final LuceneLevenshteinDistance LUCENE_LEVENSHTEIN = new LuceneLevenshteinDistance();
    private static final JaroWinklerDistance JARO_WINKLER = new JaroWinklerDistance();
    private static final NGramDistance NGRAM = new NGramDistance();

    private String term = "";
    private float similarity = DEFAULT_SIMILARITY;

    /***
     * Creates an instance with an empty term and the default target similarity of 0.5.
     */
    public FuzzyTermInfo() {}

    /***
     * Tests whether the given string contains a fuzzy term expression. The expression may appear
     * anywhere within the string; it does not need to span the whole input.
     * @param term String to test, may be null.
     * @return True if a fuzzy term expression was found, false if not or if the input was null.
     */
    public static boolean isFuzzyTerm(String term) {
        if (term == null) {
            return false;
        }
        return FUZZY_PATTERN.matcher(term.trim()).find();
    }

    /***
     * Parses a fuzzy term string into component term and similarity score. When a similarity score is
     * not present defaults to 0.5 (like <a href="https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches">Lucene</a>).
     * A similarity score outside the range 0.0 to 1.0 is clamped into that range.
     * @param term Fuzzy term expression to parse in form: <code>term~0.5</code> or <code>term~</code>. May be null.
     * @return A {@link FuzzyTermInfo} containing term and similarity score. When the input is null or contains
     * no fuzzy term expression, the result has an empty term and a similarity of 0.5.
     */
    public static FuzzyTermInfo parseFuzzyTerm(String term) {
        FuzzyTermInfo parsed = new FuzzyTermInfo();
        if (term == null) {
            // Treated the same as input which contains no fuzzy expression.
            return parsed;
        }

        Matcher m = FUZZY_PATTERN.matcher(term);
        if (m.find()) {
            parsed.term = m.group("term");
            String similarityText = m.group("similarity");
            if (similarityText.isEmpty()) {
                parsed.similarity = DEFAULT_SIMILARITY;
            } else {
                // Goes through the setter so the parsed value is clamped to [0.0, 1.0].
                parsed.setTargetSimilarity(Float.parseFloat(similarityText));
            }
        }
        return parsed;
    }

    /***
     * Scores another term against this instance's term using Lucene's <code>LevensteinDistance</code>.
     * @param otherTerm Term to compare against this instance's term.
     * @return The value reported by <code>getDistance</code>; Lucene documents this as a value between
     * 0 and 1 where 1 means the strings are identical.
     */
    public float calculateLevensteinSimilarityTo(String otherTerm) {
        return LEVENSTEIN.getDistance(this.term, otherTerm);
    }

    /***
     * Scores another term against this instance's term using Lucene's <code>LuceneLevenshteinDistance</code>.
     * @param otherTerm Term to compare against this instance's term.
     * @return The value reported by <code>getDistance</code>; Lucene documents this as a value between
     * 0 and 1 where 1 means the strings are identical.
     */
    public float calculateLuceneLevenshteinSimilarityTo(String otherTerm) {
        return LUCENE_LEVENSHTEIN.getDistance(this.term, otherTerm);
    }

    /***
     * Scores another term against this instance's term using Lucene's <code>JaroWinklerDistance</code>.
     * @param otherTerm Term to compare against this instance's term.
     * @return The value reported by <code>getDistance</code>; Lucene documents this as a value between
     * 0 and 1 where 1 means the strings are identical.
     */
    public float calculateJaroWinklerSimilarityTo(String otherTerm) {
        return JARO_WINKLER.getDistance(this.term, otherTerm);
    }

    /***
     * Scores another term against this instance's term using Lucene's <code>NGramDistance</code>.
     * @param otherTerm Term to compare against this instance's term.
     * @return The value reported by <code>getDistance</code>; Lucene documents this as a value between
     * 0 and 1 where 1 means the strings are identical.
     */
    public float calculateNGramSimilarityTo(String otherTerm) {
        return NGRAM.getDistance(this.term, otherTerm);
    }

    /***
     * @return The term portion of the fuzzy expression (without the tilde or similarity).
     */
    public String getTerm() {
        return term;
    }

    /***
     * @param term The term portion of the fuzzy expression. Stored as given, without validation.
     */
    public void setTerm(String term) {
        this.term = term;
    }

    /***
     * @return The target similarity score, 0.5 unless parsed or set otherwise.
     */
    public float getTargetSimilarity() {
        return similarity;
    }

    /***
     * Sets the target similarity score, clamping it into the range 0.0 to 1.0.
     * @param similarity Desired similarity; values below 0.0 become 0.0 and values above 1.0 become 1.0.
     */
    public void setTargetSimilarity(float similarity) {
        if (similarity < 0.0f) {
            this.similarity = 0.0f;
        } else if (similarity > 1.0f) {
            this.similarity = 1.0f;
        } else {
            this.similarity = similarity;
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package com.nuix.superutilities.misc;

/***
 * Calculates the case-insensitive Levenshtein (edit) distance between two strings.
 */
public class LevenshteinDistance {
    // https://rosettacode.org/wiki/Levenshtein_distance#Java
    /***
     * Counts the minimum number of single character insertions, deletions and substitutions needed
     * to turn one string into the other. Both inputs are lower-cased first, so the comparison
     * ignores case. Comparison is performed per <code>char</code>.
     * @param a First string, must not be null.
     * @param b Second string, must not be null.
     * @return The edit distance; 0 when the strings are equal ignoring case.
     * @throws NullPointerException If either argument is null.
     */
    public static int calculate(String a, String b) {
        // Lower-case with a fixed locale: the default-locale overload gives different results on
        // some systems (e.g. under a Turkish locale "I" becomes a dotless i and no longer equals "i").
        a = a.toLowerCase(java.util.Locale.ROOT);
        b = b.toLowerCase(java.util.Locale.ROOT);

        // Only a single row of the distance table is kept. Before the outer loop starts it holds
        // row i == 0, where the distance from the empty prefix of a to b[0..j) is j.
        int[] costs = new int[b.length() + 1];
        for (int j = 0; j < costs.length; j++) {
            costs[j] = j;
        }

        for (int i = 1; i <= a.length(); i++) {
            // Column j == 0: distance from a[0..i) to the empty prefix of b is i.
            costs[0] = i;
            // nw holds the previous row's value one column to the left, i.e. lev(i - 1, j - 1).
            int nw = i - 1;
            for (int j = 1; j <= b.length(); j++) {
                // costs[j] still holds the previous row (deletion), costs[j - 1] already holds the
                // current row (insertion), nw is the diagonal (match or substitution).
                int editCost = 1 + Math.min(costs[j], costs[j - 1]);
                int diagonalCost = a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1;
                int cj = Math.min(editCost, diagonalCost);
                // Save the previous row's value before overwriting; it is the next column's diagonal.
                nw = costs[j];
                costs[j] = cj;
            }
        }
        return costs[b.length()];
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.nuix.superutilities.misc;

/***
 * Selects the way fuzzy similarity is calculated when {@link com.nuix.superutilities.misc.TermExpander}
 * expands a fuzzy term into matched terms.
 * <p>
 * The constant names, including their spelling, and their declaration order are visible to callers
 * (for example through <code>valueOf</code> and <code>ordinal</code>), so they should not be altered.
 * @author Jason Wells
 *
 */
public enum SimilarityCalculation {
    /** NOTE(review): presumably defers similarity to Nuix itself - confirm against TermExpander. */
    Nuix,

    /** NOTE(review): presumably pairs with FuzzyTermInfo.calculateLevensteinSimilarityTo - confirm. */
    Levenstein,

    /** NOTE(review): presumably pairs with FuzzyTermInfo.calculateLuceneLevenshteinSimilarityTo - confirm. */
    LuceneLevenshstein,

    /** NOTE(review): presumably pairs with FuzzyTermInfo.calculateJaroWinklerSimilarityTo - confirm. */
    JaroWinkler,

    /** NOTE(review): presumably pairs with FuzzyTermInfo.calculateNGramSimilarityTo - confirm. */
    NGram;
}
Loading

0 comments on commit b4cd9b5

Please sign in to comment.