-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add simple char filter for normalizing characters
For NFKC and lowercasing.
- Loading branch information
Showing
7 changed files
with
190 additions
and
2 deletions.
There are no files selected for viewing
78 changes: 78 additions & 0 deletions
78
src/main/java/org/elasticsearch/analysis/UnicodeNormalizationCharFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
package org.elasticsearch.analysis; | ||
|
||
import org.apache.lucene.analysis.CharFilter; | ||
|
||
import java.io.IOException; | ||
import java.io.Reader; | ||
import java.text.Normalizer; | ||
|
||
public class UnicodeNormalizationCharFilter extends CharFilter { | ||
private final Normalizer.Form form; | ||
private final boolean lowerCase; | ||
|
||
private final StringBuilder normalized = new StringBuilder(); | ||
private final StringBuilder raw = new StringBuilder(); | ||
|
||
private final char[] buffer = new char[1024]; | ||
|
||
private boolean read = false; | ||
private int position = 0; | ||
|
||
public UnicodeNormalizationCharFilter(Reader input, Normalizer.Form form, boolean lowerCase) { | ||
super(input); | ||
this.form = form; | ||
this.lowerCase = lowerCase; | ||
} | ||
|
||
@Override | ||
protected int correct(int i) { | ||
return 0; | ||
} | ||
|
||
@Override | ||
public int read(char[] cbuf, int off, int len) throws IOException { | ||
if (off + len > this.normalized.length()) { | ||
readAllAndNormalize(); | ||
} | ||
|
||
if (!this.read) { | ||
return -1; | ||
} | ||
|
||
int readLength = Math.min(len, this.normalized.length() - this.position); | ||
|
||
this.normalized.getChars(this.position, this.position + readLength, cbuf, off); | ||
this.position += readLength; | ||
|
||
// consumed all normalized buffer. | ||
if (this.position == this.normalized.length()) { | ||
clear(); | ||
} | ||
|
||
return readLength; | ||
} | ||
|
||
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
clear(); | ||
} | ||
|
||
private void clear() { | ||
this.position = 0; | ||
this.normalized.delete(0, this.normalized.length()); | ||
this.raw.delete(0, this.raw.length()); | ||
this.read = false; | ||
} | ||
|
||
private void readAllAndNormalize() throws IOException { | ||
int length; | ||
while ((length = this.input.read(this.buffer)) != -1) { | ||
this.read = true; | ||
this.raw.append(this.buffer, 0, length); | ||
} | ||
String normalized = Normalizer.normalize(this.raw, this.form); | ||
|
||
this.normalized.append(this.lowerCase ? normalized.toLowerCase() : normalized); | ||
} | ||
} |
28 changes: 28 additions & 0 deletions
28
src/main/java/org/elasticsearch/analysis/UnicodeNormalizationCharFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package org.elasticsearch.analysis; | ||
|
||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.inject.assistedinject.Assisted; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.Index; | ||
import org.elasticsearch.index.analysis.AbstractCharFilterFactory; | ||
import org.elasticsearch.index.settings.IndexSettingsService; | ||
|
||
import java.io.Reader; | ||
import java.text.Normalizer; | ||
|
||
public class UnicodeNormalizationCharFilterFactory extends AbstractCharFilterFactory { | ||
private final Normalizer.Form form; | ||
private final boolean lowerCase; | ||
@Inject | ||
public UnicodeNormalizationCharFilterFactory(Index index, IndexSettingsService indexSettingsService, | ||
@Assisted String name, @Assisted Settings settings) { | ||
super(index, indexSettingsService.getSettings(), name); | ||
this.form = Normalizer.Form.valueOf(settings.get("form", "NFKC").toUpperCase()); | ||
this.lowerCase = settings.getAsBoolean("lower_case", true); | ||
} | ||
|
||
@Override | ||
public Reader create(Reader tokenStream) { | ||
return new UnicodeNormalizationCharFilter(tokenStream, this.form, this.lowerCase); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
52 changes: 52 additions & 0 deletions
52
src/test/java/org/elasticsearch/index/analysis/UnicodeNormalizationCharFilterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package org.elasticsearch.index.analysis; | ||
|
||
import org.apache.lucene.analysis.BaseTokenStreamTestCase; | ||
import org.apache.lucene.analysis.CharFilter; | ||
import org.apache.lucene.analysis.MockTokenizer; | ||
import org.elasticsearch.analysis.UnicodeNormalizationCharFilter; | ||
import org.junit.Test; | ||
|
||
import java.io.IOException; | ||
import java.io.StringReader; | ||
import java.text.Normalizer; | ||
|
||
public class UnicodeNormalizationCharFilterTest extends BaseTokenStreamTestCase { | ||
@Test | ||
public void testSimpleNFKC() throws IOException { | ||
String input = "アイウエオ 123 ABC"; | ||
CharFilter charFilter = new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true); | ||
|
||
|
||
MockTokenizer tokenizer = new MockTokenizer(); | ||
tokenizer.setReader(charFilter); | ||
|
||
String[] expected = new String[] {"アイウエオ", "123", "abc"}; | ||
assertTokenStreamContents(tokenizer, expected); | ||
} | ||
|
||
@Test | ||
public void testComposeNfkc() throws IOException { | ||
String input = "ガギグゲゴ"; | ||
CharFilter charFilter = new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true); | ||
|
||
|
||
MockTokenizer tokenizer = new MockTokenizer(); | ||
tokenizer.setReader(charFilter); | ||
|
||
String[] expected = new String[] {"ガギグゲゴ"}; | ||
assertTokenStreamContents(tokenizer, expected); | ||
} | ||
|
||
@Test | ||
public void testComposeNfkcHalfWidthKatakana() throws IOException { | ||
String input = "ガギグゲゴ"; | ||
CharFilter charFilter = new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true); | ||
|
||
|
||
MockTokenizer tokenizer = new MockTokenizer(); | ||
tokenizer.setReader(charFilter); | ||
|
||
String[] expected = new String[] {"ガギグゲゴ"}; | ||
assertTokenStreamContents(tokenizer, expected); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters