Add simple char filter for normalizing characters

For NFKC and lowercasing.
masaruh · Jan 6, 2016 · e51da18 · e51da18
1 parent 141789c
commit e51da18
Show file tree

Hide file tree

Showing 7 changed files with 190 additions and 2 deletions.
diff --git a/src/main/java/org/elasticsearch/analysis/UnicodeNormalizationCharFilter.java b/src/main/java/org/elasticsearch/analysis/UnicodeNormalizationCharFilter.java
@@ -0,0 +1,78 @@
+package org.elasticsearch.analysis;
+
+import org.apache.lucene.analysis.CharFilter;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.Normalizer;
+
+/**
+ * A {@link CharFilter} that applies Unicode normalization, and optionally lower-casing,
+ * to the text of the wrapped reader.
+ *
+ * <p>The wrapped reader is drained completely and normalized in a single pass the first time
+ * characters are requested; subsequent reads are served from the normalized buffer.
+ */
+public class UnicodeNormalizationCharFilter extends CharFilter {
+    private final Normalizer.Form form;
+    private final boolean lowerCase;
+
+    // Normalized output waiting to be handed out by read().
+    private final StringBuilder normalized = new StringBuilder();
+    // Raw characters drained from the wrapped reader.
+    private final StringBuilder raw = new StringBuilder();
+
+    // Scratch buffer used while draining the wrapped reader.
+    private final char[] buffer = new char[1024];
+
+    // True once at least one character has been drained and its normalized form is still pending.
+    private boolean read = false;
+    // Index into `normalized` of the next character to return.
+    private int position = 0;
+
+    /**
+     * @param input     reader supplying the raw text
+     * @param form      Unicode normalization form to apply
+     * @param lowerCase whether to lower-case the normalized text
+     */
+    public UnicodeNormalizationCharFilter(Reader input, Normalizer.Form form, boolean lowerCase) {
+        super(input);
+        this.form = form;
+        this.lowerCase = lowerCase;
+    }
+
+    /**
+     * Offset correction is not implemented: every offset is reported as 0, so offsets of
+     * tokens produced downstream do not point back into the original text.
+     */
+    @Override
+    protected int correct(int i) {
+        return 0;
+    }
+
+    /**
+     * Copies up to {@code len} normalized characters into {@code cbuf} starting at {@code off}.
+     *
+     * @return the number of characters copied, or -1 when the wrapped reader has nothing left
+     * @throws IOException if reading from the wrapped reader fails
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        // java.io.Reader contract: a zero-length request reads nothing and returns 0.
+        if (len == 0) {
+            return 0;
+        }
+
+        // Drain the wrapped reader only when no normalized output is pending. Deciding this from
+        // off/len instead could re-run normalization while output was still being consumed and
+        // append a second copy of the text to the buffer.
+        if (!this.read) {
+            readAllAndNormalize();
+        }
+
+        if (!this.read) {
+            return -1;
+        }
+
+        int readLength = Math.min(len, this.normalized.length() - this.position);
+
+        this.normalized.getChars(this.position, this.position + readLength, cbuf, off);
+        this.position += readLength;
+
+        // Everything has been handed out: drop the buffers so the next call polls the wrapped
+        // reader again and reports end of stream.
+        if (this.position == this.normalized.length()) {
+            clear();
+        }
+
+        return readLength;
+    }
+
+    /**
+     * Resets the underlying stream via {@code super.reset()} and discards buffered state.
+     * NOTE(review): {@code java.io.Reader.reset()} throws unless the stream supports it -
+     * confirm what CharFilter does here.
+     */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        clear();
+    }
+
+    /** Discards all buffered raw and normalized text and rewinds the read position. */
+    private void clear() {
+        this.position = 0;
+        this.normalized.setLength(0);
+        this.raw.setLength(0);
+        this.read = false;
+    }
+
+    /**
+     * Drains the wrapped reader into {@code raw}, then appends its normalized (and optionally
+     * lower-cased) form to {@code normalized}. Leaves {@code read} false when the wrapped reader
+     * was already exhausted.
+     */
+    private void readAllAndNormalize() throws IOException {
+        int length;
+        while ((length = this.input.read(this.buffer)) != -1) {
+            this.read = true;
+            this.raw.append(this.buffer, 0, length);
+        }
+        String result = Normalizer.normalize(this.raw, this.form);
+
+        // Locale.ROOT keeps the output independent of the JVM default locale
+        // (for example the Turkish dotless-i mapping).
+        this.normalized.append(this.lowerCase ? result.toLowerCase(java.util.Locale.ROOT) : result);
+    }
+}
diff --git a/src/main/java/org/elasticsearch/analysis/UnicodeNormalizationCharFilterFactory.java b/src/main/java/org/elasticsearch/analysis/UnicodeNormalizationCharFilterFactory.java
@@ -0,0 +1,28 @@
+package org.elasticsearch.analysis;
+
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.analysis.AbstractCharFilterFactory;
+import org.elasticsearch.index.settings.IndexSettingsService;
+
+import java.io.Reader;
+import java.text.Normalizer;
+
+/**
+ * Factory that creates {@link UnicodeNormalizationCharFilter} instances from char filter settings.
+ *
+ * <p>Settings read: {@code form}, the name of a {@link Normalizer.Form} constant (upper-cased
+ * before lookup, default {@code NFKC}), and {@code lower_case} (default {@code true}).
+ */
+public class UnicodeNormalizationCharFilterFactory extends AbstractCharFilterFactory {
+    private final Normalizer.Form form;
+    private final boolean lowerCase;
+
+    /**
+     * @throws IllegalArgumentException if {@code form} does not name a {@link Normalizer.Form} constant
+     */
+    @Inject
+    public UnicodeNormalizationCharFilterFactory(Index index, IndexSettingsService indexSettingsService,
+                                                 @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettingsService.getSettings(), name);
+
+        String configuredForm = settings.get("form", "NFKC");
+        String formConstantName = configuredForm.toUpperCase();
+        this.form = Normalizer.Form.valueOf(formConstantName);
+
+        this.lowerCase = settings.getAsBoolean("lower_case", true);
+    }
+
+    /** Wraps the given reader in a filter configured with this factory's form and lower-case flag. */
+    @Override
+    public Reader create(Reader tokenStream) {
+        return new UnicodeNormalizationCharFilter(tokenStream, form, lowerCase);
+    }
+}
diff --git a/src/main/java/org/elasticsearch/plugin/JapaneseSuggesterPlugin.java b/src/main/java/org/elasticsearch/plugin/JapaneseSuggesterPlugin.java
@@ -1,6 +1,7 @@
 package org.elasticsearch.plugin;
 
 import org.elasticsearch.analysis.KuromojiSuggestTokenizerFactory;
+import org.elasticsearch.analysis.UnicodeNormalizationCharFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisModule;
 import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
 import org.elasticsearch.indices.IndicesModule;
@@ -25,6 +26,7 @@ public void onModule(IndicesModule indicesModule) {
 
     public void onModule(AnalysisModule module) {
+        // Register this plugin's tokenizer and char filter under the type names used in analysis settings.
         module.addTokenizer("kuromoji_suggest", KuromojiSuggestTokenizerFactory.class);
+        module.addCharFilter("unicode_normalize", UnicodeNormalizationCharFilterFactory.class);
     }
 
     public void onModule(SearchModule module) {

diff --git a/src/test/java/org/elasticsearch/index/analysis/KuromojiSuggestAnalysisTest.java b/src/test/java/org/elasticsearch/index/analysis/KuromojiSuggestAnalysisTest.java
@@ -88,7 +88,7 @@ public void testDedupInput() throws IOException {
     public void testKanjiAndAlphaNumeric() throws IOException {
         testTokenization(createAnalyzer(true, false), "2015年", Lists.newArrayList("2015年", "2015nen"));
         testTokenization(createAnalyzer(true, false), "第138回", Lists.newArrayList("第138回", "dai138kai"));
-        testTokenization(createAnalyzer(true, false), "A型", Lists.newArrayList("A型", "Agata"));
+        testTokenization(createAnalyzer(true, false), "A型", Lists.newArrayList("a型", "agata"));
     }
 
     @Test

diff --git a/src/test/java/org/elasticsearch/index/analysis/UnicodeNormalizationCharFilterTest.java b/src/test/java/org/elasticsearch/index/analysis/UnicodeNormalizationCharFilterTest.java
@@ -0,0 +1,52 @@
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.elasticsearch.analysis.UnicodeNormalizationCharFilter;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.Normalizer;
+
+/** Token-level checks for {@link UnicodeNormalizationCharFilter} using NFKC with lower-casing. */
+public class UnicodeNormalizationCharFilterTest extends BaseTokenStreamTestCase {
+    /** Half-width katakana, full-width digits and full-width letters are folded to their plain forms. */
+    @Test
+    public void testSimpleNFKC() throws IOException {
+        assertNfkcLowerCased("ｱｲｳｴｵ １２３ ＡＢＣ", "アイウエオ", "123", "abc");
+    }
+
+    /** Voiced full-width katakana come out as a single token of voiced katakana. */
+    @Test
+    public void testComposeNfkc() throws IOException {
+        assertNfkcLowerCased("ガギグゲゴ", "ガギグゲゴ");
+    }
+
+    /** Half-width katakana followed by half-width voiced marks come out as voiced katakana. */
+    @Test
+    public void testComposeNfkcHalfWidthKatakana() throws IOException {
+        assertNfkcLowerCased("ｶﾞｷﾞｸﾞｹﾞｺﾞ", "ガギグゲゴ");
+    }
+
+    /**
+     * Runs {@code input} through an NFKC, lower-casing filter feeding a {@link MockTokenizer}
+     * and asserts that exactly the {@code expected} tokens are produced.
+     */
+    private static void assertNfkcLowerCased(String input, String... expected) throws IOException {
+        CharFilter charFilter =
+                new UnicodeNormalizationCharFilter(new StringReader(input), Normalizer.Form.NFKC, true);
+
+        MockTokenizer tokenizer = new MockTokenizer();
+        tokenizer.setReader(charFilter);
+
+        assertTokenStreamContents(tokenizer, expected);
+    }
+}
diff --git a/...est/java/org/elasticsearch/search/suggest/completion/JapaneseCompletionSuggesterTest.java b/...est/java/org/elasticsearch/search/suggest/completion/JapaneseCompletionSuggesterTest.java
@@ -84,6 +84,20 @@ public void testPrefixFiltering() throws IOException {
         assertSuggestResult(index, field, "小", 1, null);
     }
 
+    /**
+     * Indexes a half-width katakana value and an upper-case ASCII value, then queries with the
+     * full-width and lower-case forms respectively.
+     * NOTE(review): the assertSuggestResult arguments are presumably (index, field, query,
+     * expected count, expected suggestion) - confirm against the helper.
+     */
+    public void testNormlization() throws IOException {
+        final String indexName = "normalization_test";
+        final String typeName = "type";
+        final String fieldName = "suggest";
+
+        final String halfWidthKana = "ｶﾞｷﾞｸﾞｹﾞｺﾞ";
+        final String upperCaseAscii = "ABCDE";
+
+        createTestIndex(indexName, typeName, fieldName);
+
+        feedDocument(indexName, typeName, fieldName, halfWidthKana);
+        feedDocument(indexName, typeName, fieldName, upperCaseAscii);
+
+        assertSuggestResult(indexName, fieldName, "ガ", 1, halfWidthKana);
+        assertSuggestResult(indexName, fieldName, "a", 1, upperCaseAscii);
+    }
+
     public void createTestIndex(String index, String type, String completionField) throws IOException {
         client().admin().indices().prepareCreate(index)
                 .setSettings(
@@ -94,9 +108,11 @@ public void createTestIndex(String index, String type, String completionField) t
                                         .startObject("analyzer")
                                             .startObject("kuromoji_suggest_index")
                                                 .field("tokenizer", "kuromoji_suggest_index")
+                                                .array("char_filter", "nfkc_lc")
                                             .endObject()
                                             .startObject("kuromoji_suggest_search")
                                                 .field("tokenizer", "kuromoji_suggest_search")
+                                                .array("char_filter", "nfkc_lc")
                                             .endObject()
                                         .endObject()
                                         .startObject("tokenizer")
@@ -109,6 +125,11 @@ public void createTestIndex(String index, String type, String completionField) t
                                                 .field("expand", false)
                                             .endObject()
                                         .endObject()
+                                        .startObject("char_filter")
+                                            .startObject("nfkc_lc")
+                                                .field("type", "unicode_normalize")
+                                            .endObject()
+                                        .endObject()
                                     .endObject()
                                 .endObject()
                             .endObject())

diff --git a/src/test/resources/analyzer_configuration_template.json b/src/test/resources/analyzer_configuration_template.json
@@ -3,7 +3,8 @@
     "analysis": {
       "analyzer": {
         "kuromoji_suggest":{
-          "tokenizer":"kuromoji_suggest"
+          "tokenizer":"kuromoji_suggest",
+          "char_filter": ["nfkc_lc"]
         }
       },
 
@@ -14,6 +15,12 @@
           "max_expansions": %MAX_EXPANSIONS%,
           "edge_ngram": %EDGE_NGRAM%
         }
+      },
+
+      "char_filter":{
+        "nfkc_lc": {
+          "type": "unicode_normalize"
+        }
       }
     }
   }