Skip to content

Commit

Permalink
Add simple char filter for normalizing characters
Browse files Browse the repository at this point in the history
For NFKC and lowercasing.
  • Loading branch information
masaruh committed Jan 6, 2016
1 parent 141789c commit e51da18
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package org.elasticsearch.analysis;

import org.apache.lucene.analysis.CharFilter;

import java.io.IOException;
import java.io.Reader;
import java.text.Normalizer;
import java.util.Locale;

/**
 * A {@link CharFilter} that applies Unicode normalization (e.g. NFKC) to its
 * input, optionally lower-casing the result.
 *
 * The entire underlying reader is consumed and normalized on the first read,
 * because Unicode normalization cannot be applied safely at an arbitrary chunk
 * boundary (combining characters may span chunks).
 */
public class UnicodeNormalizationCharFilter extends CharFilter {

    /** Normalization form to apply, e.g. {@link java.text.Normalizer.Form#NFKC}. */
    private final Normalizer.Form form;
    /** Whether to lower-case the normalized text. */
    private final boolean lowerCase;

    /** Normalized output buffered for the consumer. */
    private final StringBuilder normalized = new StringBuilder();
    /** Raw input accumulated before normalization. */
    private final StringBuilder raw = new StringBuilder();

    private final char[] buffer = new char[1024];

    /** True once at least one chunk has been read from the underlying reader. */
    private boolean read = false;
    /** Read position within {@link #normalized}. */
    private int position = 0;

    public UnicodeNormalizationCharFilter(Reader input, Normalizer.Form form, boolean lowerCase) {
        super(input);
        this.form = form;
        this.lowerCase = lowerCase;
    }

    @Override
    protected int correct(int i) {
        // No per-character offset map is maintained, so offsets into the
        // normalized text cannot be mapped back to raw-input offsets exactly.
        // Returning the offset unchanged is a far closer approximation than
        // the previous constant 0, which mapped every token offset to the
        // start of the input and broke offset-based features (highlighting)
        // entirely. TODO(review): track a real offset map for inputs whose
        // length changes under normalization.
        return i;
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        // Lazily consume and normalize the whole input when the buffered
        // normalized text cannot satisfy this request. Compare against the
        // unread remainder (position + len), not off + len: the caller's
        // destination offset is unrelated to this filter's internal buffer.
        if (this.position + len > this.normalized.length()) {
            readAllAndNormalize();
        }

        if (!this.read) {
            return -1; // underlying reader produced no data
        }

        int readLength = Math.min(len, this.normalized.length() - this.position);

        this.normalized.getChars(this.position, this.position + readLength, cbuf, off);
        this.position += readLength;

        // consumed all normalized buffer.
        if (this.position == this.normalized.length()) {
            clear();
        }

        return readLength;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        clear();
    }

    /** Resets all buffered state (used after full consumption and on reset()). */
    private void clear() {
        this.position = 0;
        this.normalized.setLength(0);
        this.raw.setLength(0);
        this.read = false;
    }

    /**
     * Drains the underlying reader completely, then normalizes (and optionally
     * lower-cases) the accumulated text into {@link #normalized}.
     */
    private void readAllAndNormalize() throws IOException {
        int length;
        while ((length = this.input.read(this.buffer)) != -1) {
            this.read = true;
            this.raw.append(this.buffer, 0, length);
        }
        String result = Normalizer.normalize(this.raw, this.form);

        // Locale.ROOT avoids locale-sensitive case mappings (e.g. the Turkish
        // dotless i), which would make index contents depend on the JVM locale.
        this.normalized.append(this.lowerCase ? result.toLowerCase(Locale.ROOT) : result);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package org.elasticsearch.analysis;

import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractCharFilterFactory;
import org.elasticsearch.index.settings.IndexSettingsService;

import java.io.Reader;
import java.text.Normalizer;
import java.util.Locale;

/**
 * Factory for {@link UnicodeNormalizationCharFilter}.
 *
 * Recognized settings:
 *   "form"       - Unicode normalization form name (NFC, NFD, NFKC, NFKD); defaults to NFKC.
 *   "lower_case" - whether to lower-case the normalized text; defaults to true.
 */
public class UnicodeNormalizationCharFilterFactory extends AbstractCharFilterFactory {

    private final Normalizer.Form form;
    private final boolean lowerCase;

    @Inject
    public UnicodeNormalizationCharFilterFactory(Index index, IndexSettingsService indexSettingsService,
                                                 @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name);
        // Locale.ROOT keeps the form-name case mapping locale-independent;
        // the default locale could otherwise mis-map e.g. "nfkc" under the
        // Turkish locale and make Form.valueOf fail.
        this.form = Normalizer.Form.valueOf(settings.get("form", "NFKC").toUpperCase(Locale.ROOT));
        this.lowerCase = settings.getAsBoolean("lower_case", true);
    }

    @Override
    public Reader create(Reader tokenStream) {
        return new UnicodeNormalizationCharFilter(tokenStream, this.form, this.lowerCase);
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.elasticsearch.plugin;

import org.elasticsearch.analysis.KuromojiSuggestTokenizerFactory;
import org.elasticsearch.analysis.UnicodeNormalizationCharFilterFactory;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.indices.IndicesModule;
Expand All @@ -25,6 +26,7 @@ public void onModule(IndicesModule indicesModule) {

// Registers this plugin's analysis components with the analysis module:
// the suggest tokenizer and the "unicode_normalize" char filter (NFKC +
// optional lower-casing) that analyzers reference by those names.
public void onModule(AnalysisModule module) {
module.addTokenizer("kuromoji_suggest", KuromojiSuggestTokenizerFactory.class);
module.addCharFilter("unicode_normalize", UnicodeNormalizationCharFilterFactory.class);
}

public void onModule(SearchModule module) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public void testDedupInput() throws IOException {
public void testKanjiAndAlphaNumeric() throws IOException {
testTokenization(createAnalyzer(true, false), "2015年", Lists.newArrayList("2015年", "2015nen"));
testTokenization(createAnalyzer(true, false), "第138回", Lists.newArrayList("第138回", "dai138kai"));
testTokenization(createAnalyzer(true, false), "A型", Lists.newArrayList("A型", "Agata"));
testTokenization(createAnalyzer(true, false), "A型", Lists.newArrayList("a型", "agata"));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.elasticsearch.analysis.UnicodeNormalizationCharFilter;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;
import java.text.Normalizer;

/**
 * Unit tests for {@link UnicodeNormalizationCharFilter}.
 *
 * Inputs that differ only in normalization form (precomposed vs. decomposed,
 * full-width vs. half-width) are written with explicit \\u escapes: visually
 * identical glyphs would otherwise hide the test intent, and editors/web views
 * silently normalize pasted text. (In the original source the "compose" and
 * "half-width" tests had become byte-identical — presumably exactly this kind
 * of accidental normalization.)
 */
public class UnicodeNormalizationCharFilterTest extends BaseTokenStreamTestCase {

    @Test
    public void testSimpleNFKC() throws IOException {
        // Katakana passes through; lower-casing folds ABC -> abc. NFKC is a
        // no-op on the ASCII portion of this input.
        String input = "アイウエオ 123 ABC";
        CharFilter charFilter = new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true);

        MockTokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(charFilter);

        String[] expected = new String[] {"アイウエオ", "123", "abc"};
        assertTokenStreamContents(tokenizer, expected);
    }

    @Test
    public void testComposeNfkc() throws IOException {
        // Decomposed katakana (base letter + combining voiced sound mark
        // U+3099) must compose to the precomposed voiced letters under NFKC.
        String input = "\u30ab\u3099\u30ad\u3099\u30af\u3099\u30b1\u3099\u30b3\u3099";
        CharFilter charFilter = new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true);

        MockTokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(charFilter);

        // "ガギグゲゴ" in precomposed form.
        String[] expected = new String[] {"\u30ac\u30ae\u30b0\u30b2\u30b4"};
        assertTokenStreamContents(tokenizer, expected);
    }

    @Test
    public void testComposeNfkcHalfWidthKatakana() throws IOException {
        // Half-width katakana with half-width voiced sound marks (U+FF9E)
        // must normalize to full-width precomposed letters under NFKC.
        String input = "\uff76\uff9e\uff77\uff9e\uff78\uff9e\uff79\uff9e\uff7a\uff9e";
        CharFilter charFilter = new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true);

        MockTokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(charFilter);

        // "ガギグゲゴ" in precomposed form.
        String[] expected = new String[] {"\u30ac\u30ae\u30b0\u30b2\u30b4"};
        assertTokenStreamContents(tokenizer, expected);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,20 @@ public void testPrefixFiltering() throws IOException {
assertSuggestResult(index, field, "小", 1, null);
}

/**
 * Verifies that the "unicode_normalize" char filter is wired into both the
 * index and search analyzers: queries must match documents regardless of
 * letter case (and, via NFKC, normalization form).
 *
 * Renamed from the original "testNormlization" (typo); test methods are
 * discovered by the "test" name prefix, so no callers are affected.
 */
public void testNormalization() throws IOException {
    String index = "normalization_test";
    String type = "type";
    String field = "suggest";

    createTestIndex(index, type, field);

    feedDocument(index, type, field, "ガギグゲゴ");
    feedDocument(index, type, field, "ABCDE");

    // Both document and query text pass through the same char filter, so the
    // lower-case query "a" must still find the upper-case document "ABCDE".
    assertSuggestResult(index, field, "ガ", 1, "ガギグゲゴ");
    assertSuggestResult(index, field, "a", 1, "ABCDE");
}

public void createTestIndex(String index, String type, String completionField) throws IOException {
client().admin().indices().prepareCreate(index)
.setSettings(
Expand All @@ -94,9 +108,11 @@ public void createTestIndex(String index, String type, String completionField) t
.startObject("analyzer")
.startObject("kuromoji_suggest_index")
.field("tokenizer", "kuromoji_suggest_index")
.array("char_filter", "nfkc_lc")
.endObject()
.startObject("kuromoji_suggest_search")
.field("tokenizer", "kuromoji_suggest_search")
.array("char_filter", "nfkc_lc")
.endObject()
.endObject()
.startObject("tokenizer")
Expand All @@ -109,6 +125,11 @@ public void createTestIndex(String index, String type, String completionField) t
.field("expand", false)
.endObject()
.endObject()
.startObject("char_filter")
.startObject("nfkc_lc")
.field("type", "unicode_normalize")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject())
Expand Down
9 changes: 8 additions & 1 deletion src/test/resources/analyzer_configuration_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"analysis": {
"analyzer": {
"kuromoji_suggest":{
"tokenizer":"kuromoji_suggest"
"tokenizer":"kuromoji_suggest",
"char_filter": ["nfkc_lc"]
}
},

Expand All @@ -14,6 +15,12 @@
"max_expansions": %MAX_EXPANSIONS%,
"edge_ngram": %EDGE_NGRAM%
}
},

"char_filter":{
"nfkc_lc": {
"type": "unicode_normalize"
}
}
}
}
Expand Down

0 comments on commit e51da18

Please sign in to comment.