Merge pull request #248 from WorksApplications/feature/244-iterate-entries

mh-northlander · web-flow · commit 9c43b13f4ca2 · 2024-11-28T13:51:12.000+09:00
Stream dictionary entries
diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -16,12 +16,13 @@
 
 package com.worksap.nlp.sudachi;
 
-import com.worksap.nlp.sudachi.dictionary.POS;
-
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.function.Predicate;
+import java.util.stream.Stream;
+
+import com.worksap.nlp.sudachi.dictionary.POS;
 
 /**
  * A lexicon and a grammar for morphological analysis.
@@ -56,6 +57,18 @@ public interface Dictionary extends AutoCloseable {
     @Override
     public void close() throws IOException;
 
+    /**
+     * Create a parallel stream of all words in the dictionary as morphemes.
+     *
+     * Corresponds to the lines in the lexicon csv, i.e. it includes entries that
+     * appear only when referred to by other words (e.g. as a constituent) during
+     * an analysis, and excludes entries automatically added to store the
+     * normalization form of another word. Entries in the stream are not sorted.
+     *
+     * @return a parallel stream of morphemes.
+     */
+    public Stream<Morpheme> entries();
+
     /**
      * Lookup entries in the dictionary without performing an analysis.
      * 
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -25,9 +25,15 @@
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Spliterator;
+import java.util.Spliterators;
 import java.util.function.Predicate;
 import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
 
 public class JapaneseDictionary implements Dictionary, DictionaryAccess {
 
@@ -127,6 +133,45 @@ public void close() throws IOException {
         }
     }
 
+    /**
+     * Iterator of morphemes in the dictionary.
+     */
+    private class EntryItr implements Iterator<Morpheme> {
+        private final GrammarImpl grammar;
+        private final LexiconSet lexicon;
+        private Iterator<Integer> wordIdItr;
+
+        EntryItr() {
+            this.grammar = getGrammar();
+            this.lexicon = getLexicon();
+            this.wordIdItr = this.lexicon.wordIds();
+        }
+
+        @Override
+        public boolean hasNext() {
+            return wordIdItr.hasNext();
+        }
+
+        @Override
+        public Morpheme next() {
+            if (!hasNext()) {
+                throw new NoSuchElementException();
+            }
+            return new SingleMorphemeImpl(this.grammar, this.lexicon, wordIdItr.next());
+        }
+    }
+
+    @Override
+    public Stream<Morpheme> entries() {
+        Iterator<Morpheme> iterator = new EntryItr();
+        int size = getLexicon().size();
+        int characteristics = Spliterator.DISTINCT | Spliterator.IMMUTABLE | Spliterator.NONNULL | Spliterator.SIZED;
+        boolean parallel = true;
+
+        Spliterator<Morpheme> spliterator = Spliterators.spliterator(iterator, size, characteristics);
+        return StreamSupport.stream(spliterator, parallel);
+    }
+
     @Override
     public List<Morpheme> lookup(CharSequence surface) {
         UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
@@ -204,10 +249,12 @@ static String readAll(InputStream input) throws IOException {
         }
     }
 
+    @Override
     public GrammarImpl getGrammar() {
         return grammar;
     }
 
+    @Override
     public LexiconSet getLexicon() {
         return lexicon;
     }
diff --git a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,9 +123,10 @@ public interface Morpheme {
      * The IDs change when the dictionaries are updated or the combination of
      * dictionaries changes.
      *
-     * If the morpheme is OOV, it returns an undefined value.
+     * If the morpheme is OOV, the id consists of the OOV flag and the POS id.
      *
      * @return the word ID
+     * @see WordId
      */
     public int getWordId();
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
@@ -93,12 +93,13 @@ public enum WordRefMode {
         grammar.setCharacterCategory(CharacterCategory.loadDefault());
         textNormalizer = new TextNormalizer(grammar);
 
-        // in order to output dictionary entries in in-dictionary order we need to sort
-        // them. iterator over them will get them not in the sorted order, but grouped
-        // by index-form (and sorted in groups).
+        // In order to output dictionary entries in in-dictionary order we need to sort
+        // them. Iterating over them does not yield sorted order; they come grouped
+        // by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
+        // for performance.
         DoubleArrayLexicon targetLex = dic.getLexicon();
         Ints allIds = new Ints(targetLex.size());
-        Iterator<Ints> ids = targetLex.wordIds(0);
+        Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
         while (ids.hasNext()) {
             allIds.appendAll(ids.next());
         }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 import java.nio.ByteBuffer;
 import java.nio.IntBuffer;
 import java.util.Iterator;
+import java.util.NoSuchElementException;
 
 import com.worksap.nlp.dartsclone.DoubleArray;
 import com.worksap.nlp.sudachi.MorphemeList;
@@ -103,7 +104,7 @@ public long parameters(int wordId) {
 
     private class Itr implements Iterator<int[]> {
         private final Iterator<int[]> iterator;
-        private Integer[] wordIds;
+        private int[] wordIds;
         private int length;
         private int index;
 
@@ -148,8 +149,39 @@ public int size() {
         return description.getNumTotalEntries();
     }
 
-    public Iterator<Ints> wordIds(int dic) {
-        return wordIdTable.wordIds();
+    public Iterator<Integer> wordIds() {
+        return new WordIdItr();
+    }
+
+    private class WordIdItr implements Iterator<Integer> {
+        private final Iterator<Ints> iterator;
+        private Ints ints;
+        private int index;
+
+        WordIdItr() {
+            this.iterator = getWordIdTable().wordIds();
+            index = 0;
+        }
+
+        @Override
+        public boolean hasNext() {
+            while (ints == null || index >= ints.length()) {
+                if (!iterator.hasNext()) {
+                    return false;
+                }
+                ints = iterator.next();
+                index = 0;
+            }
+            return true;
+        }
+
+        @Override
+        public Integer next() {
+            if (!hasNext()) {
+                throw new NoSuchElementException();
+            }
+            return ints.get(index++);
+        }
     }
 
     /**
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,15 @@
  */
 public interface Lexicon {
 
+    /**
+     * Lookup entries that match the text starting from the offset.
+     * 
+     * @param text
+     *            input byte text. should be normalized
+     * @param offset
+     *            input offset to start lookup from
+     * @return iterator of (wordid, length) pair
+     */
     Iterator<int[]> lookup(byte[] text, int offset);
 
     /**
@@ -73,8 +82,9 @@ public interface Lexicon {
     WordInfoList wordInfos(int dic);
 
     /**
-     * Iterates over all word ids in the specified dictionary. Returned word ids are
-     * not sorted.
+     * Iterates over all word ids in the dictionary.
+     * 
+     * Returned word ids are not sorted.
      */
-    Iterator<Ints> wordIds(int dic);
+    Iterator<Integer> wordIds();
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Works Applications Co., Ltd.
+ * Copyright (c) 2017-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -171,7 +171,38 @@ public WordInfoList wordInfos(int dic) {
     }
 
     @Override
-    public Iterator<Ints> wordIds(int dic) {
-        return lexicons.get(dic).wordIds(dic);
+    public Iterator<Integer> wordIds() {
+        return new WordIdItr();
+    }
+
+    private class WordIdItr implements Iterator<Integer> {
+        private int dictId;
+        private Iterator<Integer> iterator;
+
+        WordIdItr() {
+            this.dictId = 0;
+            this.iterator = lexicons.get(dictId).wordIds();
+        }
+
+        @Override
+        public boolean hasNext() {
+            while (!iterator.hasNext()) {
+                int nextDictId = dictId + 1;
+                if (nextDictId >= lexicons.size()) {
+                    return false;
+                }
+                dictId = nextDictId;
+                iterator = lexicons.get(nextDictId).wordIds();
+            }
+            return true;
+        }
+
+        @Override
+        public Integer next() {
+            if (!hasNext()) {
+                throw new NoSuchElementException();
+            }
+            return iterator.next();
+        }
     }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,27 +23,27 @@
 import java.util.NoSuchElementException;
 import java.util.Iterator;
 
-class WordIdTable {
+/**
+ * Table which contains the lists of (internal) word ids that share the same
+ * index form.
+ * 
+ * Fills in the dictionary part of each word id using the configured dicId.
+ */
+public class WordIdTable {
     private final ByteBuffer bytes;
     private int dicIdMask = 0;
 
     WordIdTable(ByteBuffer bytes) {
         this.bytes = bytes;
     }
 
-    Integer[] get(int index) {
+    int[] get(int index) {
         ByteBuffer dup = bytes.duplicate();
         dup.position(index);
         BufReader reader = new BufReader(dup);
         int length = reader.readVarint32();
-        Integer[] result = new Integer[length];
-        int mask = dicIdMask;
-        int sum = 0;
-        for (int i = 0; i < length; i++) {
-            int v = reader.readVarint32();
-            result[i] = WordId.applyMask(v + sum, mask);
-            sum += v;
-        }
+        int[] result = new int[length];
+        readDeltaCompressed(result, length, this.dicIdMask, reader);
         return result;
     }
 
@@ -75,8 +75,8 @@ private static void readDeltaCompressed(int[] result, int count, int mask, BufRe
         }
     }
 
-    void setDictionaryId(int id) {
-        dicIdMask = WordId.dicIdMask(id);
+    void setDictionaryId(int dictId) {
+        dicIdMask = WordId.dicIdMask(dictId);
     }
 
     /**
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java
@@ -19,7 +19,6 @@
 import com.worksap.nlp.sudachi.dictionary.Block;
 import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon;
 import com.worksap.nlp.sudachi.dictionary.Ints;
-import com.worksap.nlp.sudachi.dictionary.Lexicon;
 import com.worksap.nlp.sudachi.dictionary.WordInfoList;
 
 import java.io.IOException;
@@ -63,17 +62,19 @@ public class RawLexicon {
      * used to resolve wordref.
      * 
      * @param lexicon
+     *            lexicon of a system dictionary.
      * @return number of entries read.
      */
-    public int preloadFrom(Lexicon lexicon, Progress progress) {
+    public int preloadFrom(DoubleArrayLexicon lexicon, Progress progress) {
         this.isUser = true;
 
         Ints allIds = new Ints(lexicon.size());
-        Iterator<Ints> ids = lexicon.wordIds(0);
+        Iterator<Ints> ids = lexicon.getWordIdTable().wordIds();
         while (ids.hasNext()) {
             allIds.appendAll(ids.next());
         }
         allIds.sort();
+
         for (int i = 0; i < allIds.length(); i++) {
             preloadedEntries.add(new CompiledWordEntry(lexicon, allIds.get(i)));
             progress.progress(i, allIds.length());
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt
diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt
diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2021 Works Applications Co., Ltd.`
	`2`	`+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.`
`3`	`3`	`*`
`4`	`4`	`* Licensed under the Apache License, Version 2.0 (the "License");`
`5`	`5`	`* you may not use this file except in compliance with the License.`
`@@ -123,9 +123,10 @@ public interface Morpheme {`
`123`	`123`	`* The IDs change when the dictionaries are updated or the combination of`
`124`	`124`	`* dictionaries changes.`
`125`	`125`	`*`
`126`		`- * If the morpheme is OOV, it returns an undefined value.`
	`126`	`+ * If the morpheme is OOV, the id consists of the OOV flag and the POS id.`
`127`	`127`	`*`
`128`	`128`	`* @return the word ID`
	`129`	`+ * @see WordId`
`129`	`130`	`*/`
`130`	`131`	`public int getWordId();`
`131`	`132`