Skip to content

Commit 9c43b13

Browse files
Merge pull request #248 from WorksApplications/feature/244-iterate-entries
Stream dictionary entries
2 parents 148d072 + 73d1978 commit 9c43b13

File tree

13 files changed

+210
-41
lines changed

13 files changed

+210
-41
lines changed

src/main/java/com/worksap/nlp/sudachi/Dictionary.java

+15-2
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616

1717
package com.worksap.nlp.sudachi;
1818

19-
import com.worksap.nlp.sudachi.dictionary.POS;
20-
2119
import java.io.IOException;
2220
import java.util.Arrays;
2321
import java.util.List;
2422
import java.util.function.Predicate;
23+
import java.util.stream.Stream;
24+
25+
import com.worksap.nlp.sudachi.dictionary.POS;
2526

2627
/**
2728
* A lexicon and a grammar for morphological analysis.
@@ -56,6 +57,18 @@ public interface Dictionary extends AutoCloseable {
5657
@Override
5758
public void close() throws IOException;
5859

60+
/**
61+
* Create a parallel stream of all words in the dictionary as morphemes.
62+
*
63+
* Corresponds to the lines in the lexicon csv, i.e. it includes entries that
64+
* appear only when referred to from other words (e.g. as constituents) during an
65+
* analysis and excludes entries that are automatically added to store a
66+
* normalization form of another word. Entries in the stream are not sorted.
67+
*
68+
* @return a parallel stream of morphemes.
69+
*/
70+
public Stream<Morpheme> entries();
71+
5972
/**
6073
* Lookup entries in the dictionary without performing an analysis.
6174
*

src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java

+47
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,15 @@
2525
import java.nio.charset.StandardCharsets;
2626
import java.util.ArrayList;
2727
import java.util.Collections;
28+
import java.util.Iterator;
2829
import java.util.List;
30+
import java.util.NoSuchElementException;
31+
import java.util.Spliterator;
32+
import java.util.Spliterators;
2933
import java.util.function.Predicate;
3034
import java.util.stream.IntStream;
35+
import java.util.stream.Stream;
36+
import java.util.stream.StreamSupport;
3137

3238
public class JapaneseDictionary implements Dictionary, DictionaryAccess {
3339

@@ -127,6 +133,45 @@ public void close() throws IOException {
127133
}
128134
}
129135

136+
/**
137+
* Iterator of morphemes in the dictionary.
138+
*/
139+
private class EntryItr implements Iterator<Morpheme> {
140+
private final GrammarImpl grammar;
141+
private final LexiconSet lexicon;
142+
private Iterator<Integer> wordIdItr;
143+
144+
EntryItr() {
145+
this.grammar = getGrammar();
146+
this.lexicon = getLexicon();
147+
this.wordIdItr = this.lexicon.wordIds();
148+
}
149+
150+
@Override
151+
public boolean hasNext() {
152+
return wordIdItr.hasNext();
153+
}
154+
155+
@Override
156+
public Morpheme next() {
157+
if (!hasNext()) {
158+
throw new NoSuchElementException();
159+
}
160+
return new SingleMorphemeImpl(this.grammar, this.lexicon, wordIdItr.next());
161+
}
162+
}
163+
164+
@Override
165+
public Stream<Morpheme> entries() {
166+
Iterator<Morpheme> iterator = new EntryItr();
167+
int size = getLexicon().size();
168+
int characteristics = Spliterator.DISTINCT | Spliterator.IMMUTABLE | Spliterator.NONNULL | Spliterator.SIZED;
169+
boolean parallel = true;
170+
171+
Spliterator<Morpheme> spliterator = Spliterators.spliterator(iterator, size, characteristics);
172+
return StreamSupport.stream(spliterator, parallel);
173+
}
174+
130175
@Override
131176
public List<Morpheme> lookup(CharSequence surface) {
132177
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
@@ -204,10 +249,12 @@ static String readAll(InputStream input) throws IOException {
204249
}
205250
}
206251

252+
@Override
207253
public GrammarImpl getGrammar() {
208254
return grammar;
209255
}
210256

257+
@Override
211258
public LexiconSet getLexicon() {
212259
return lexicon;
213260
}

src/main/java/com/worksap/nlp/sudachi/Morpheme.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -123,9 +123,10 @@ public interface Morpheme {
123123
* The IDs change when the dictionaries are updated or the combination of
124124
* dictionaries changes.
125125
*
126-
* If the morpheme is OOV, it returns an undefined value.
126+
* If the morpheme is OOV, it returns an id consisting of the OOV flag and the POS id.
127127
*
128128
* @return the word ID
129+
* @see WordId
129130
*/
130131
public int getWordId();
131132

src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java

+5-4
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,13 @@ public enum WordRefMode {
9393
grammar.setCharacterCategory(CharacterCategory.loadDefault());
9494
textNormalizer = new TextNormalizer(grammar);
9595

96-
// in order to output dictionary entries in in-dictionary order we need to sort
97-
// them. iterator over them will get them not in the sorted order, but grouped
98-
// by index-form (and sorted in groups).
96+
// In order to output dictionary entries in in-dictionary order we need to sort
97+
// them. Iterating over them yields them not in sorted order, but grouped
98+
// by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
99+
// for performance.
99100
DoubleArrayLexicon targetLex = dic.getLexicon();
100101
Ints allIds = new Ints(targetLex.size());
101-
Iterator<Ints> ids = targetLex.wordIds(0);
102+
Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
102103
while (ids.hasNext()) {
103104
allIds.appendAll(ids.next());
104105
}

src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java

+36-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
1919
import java.nio.ByteBuffer;
2020
import java.nio.IntBuffer;
2121
import java.util.Iterator;
22+
import java.util.NoSuchElementException;
2223

2324
import com.worksap.nlp.dartsclone.DoubleArray;
2425
import com.worksap.nlp.sudachi.MorphemeList;
@@ -103,7 +104,7 @@ public long parameters(int wordId) {
103104

104105
private class Itr implements Iterator<int[]> {
105106
private final Iterator<int[]> iterator;
106-
private Integer[] wordIds;
107+
private int[] wordIds;
107108
private int length;
108109
private int index;
109110

@@ -148,8 +149,39 @@ public int size() {
148149
return description.getNumTotalEntries();
149150
}
150151

151-
public Iterator<Ints> wordIds(int dic) {
152-
return wordIdTable.wordIds();
152+
public Iterator<Integer> wordIds() {
153+
return new WordIdItr();
154+
}
155+
156+
private class WordIdItr implements Iterator<Integer> {
157+
private final Iterator<Ints> iterator;
158+
private Ints ints;
159+
private int index;
160+
161+
WordIdItr() {
162+
this.iterator = getWordIdTable().wordIds();
163+
index = 0;
164+
}
165+
166+
@Override
167+
public boolean hasNext() {
168+
while (ints == null || index >= ints.length()) {
169+
if (!iterator.hasNext()) {
170+
return false;
171+
}
172+
ints = iterator.next();
173+
index = 0;
174+
}
175+
return true;
176+
}
177+
178+
@Override
179+
public Integer next() {
180+
if (!hasNext()) {
181+
throw new NoSuchElementException();
182+
}
183+
return ints.get(index++);
184+
}
153185
}
154186

155187
/**

src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java

+14-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -25,6 +25,15 @@
2525
*/
2626
public interface Lexicon {
2727

28+
/**
29+
* Lookup entries that match the text starting from the offset.
30+
*
31+
* @param text
32+
input byte text; should be normalized
33+
* @param offset
34+
* input offset to start lookup from
35+
* @return iterator of (word id, length) pairs
36+
*/
2837
Iterator<int[]> lookup(byte[] text, int offset);
2938

3039
/**
@@ -73,8 +82,9 @@ public interface Lexicon {
7382
WordInfoList wordInfos(int dic);
7483

7584
/**
76-
* Iterates over all word ids in the specified dictionary. Returned word ids are
77-
* not sorted.
85+
* Iterates over all word ids in the dictionary.
86+
*
87+
* Returned word ids are not sorted.
7888
*/
79-
Iterator<Ints> wordIds(int dic);
89+
Iterator<Integer> wordIds();
8090
}

src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java

+34-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017-2022 Works Applications Co., Ltd.
2+
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -171,7 +171,38 @@ public WordInfoList wordInfos(int dic) {
171171
}
172172

173173
@Override
174-
public Iterator<Ints> wordIds(int dic) {
175-
return lexicons.get(dic).wordIds(dic);
174+
public Iterator<Integer> wordIds() {
175+
return new WordIdItr();
176+
}
177+
178+
private class WordIdItr implements Iterator<Integer> {
179+
private int dictId;
180+
private Iterator<Integer> iterator;
181+
182+
WordIdItr() {
183+
this.dictId = 0;
184+
this.iterator = lexicons.get(dictId).wordIds();
185+
}
186+
187+
@Override
188+
public boolean hasNext() {
189+
while (!iterator.hasNext()) {
190+
int nextDictId = dictId + 1;
191+
if (nextDictId >= lexicons.size()) {
192+
return false;
193+
}
194+
dictId = nextDictId;
195+
iterator = lexicons.get(nextDictId).wordIds();
196+
}
197+
return true;
198+
}
199+
200+
@Override
201+
public Integer next() {
202+
if (!hasNext()) {
203+
throw new NoSuchElementException();
204+
}
205+
return iterator.next();
206+
}
176207
}
177208
}

src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java

+13-13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -23,27 +23,27 @@
2323
import java.util.NoSuchElementException;
2424
import java.util.Iterator;
2525

26-
class WordIdTable {
26+
/**
27+
* Table which contains the list of (internal) word ids that have the same index
28+
* form.
29+
*
30+
* Automatically fills the dictionary part of each word id using the dicId set.
31+
*/
32+
public class WordIdTable {
2733
private final ByteBuffer bytes;
2834
private int dicIdMask = 0;
2935

3036
WordIdTable(ByteBuffer bytes) {
3137
this.bytes = bytes;
3238
}
3339

34-
Integer[] get(int index) {
40+
int[] get(int index) {
3541
ByteBuffer dup = bytes.duplicate();
3642
dup.position(index);
3743
BufReader reader = new BufReader(dup);
3844
int length = reader.readVarint32();
39-
Integer[] result = new Integer[length];
40-
int mask = dicIdMask;
41-
int sum = 0;
42-
for (int i = 0; i < length; i++) {
43-
int v = reader.readVarint32();
44-
result[i] = WordId.applyMask(v + sum, mask);
45-
sum += v;
46-
}
45+
int[] result = new int[length];
46+
readDeltaCompressed(result, length, this.dicIdMask, reader);
4747
return result;
4848
}
4949

@@ -75,8 +75,8 @@ private static void readDeltaCompressed(int[] result, int count, int mask, BufRe
7575
}
7676
}
7777

78-
void setDictionaryId(int id) {
79-
dicIdMask = WordId.dicIdMask(id);
78+
void setDictionaryId(int dictId) {
79+
dicIdMask = WordId.dicIdMask(dictId);
8080
}
8181

8282
/**

src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java

+4-3
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import com.worksap.nlp.sudachi.dictionary.Block;
2020
import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon;
2121
import com.worksap.nlp.sudachi.dictionary.Ints;
22-
import com.worksap.nlp.sudachi.dictionary.Lexicon;
2322
import com.worksap.nlp.sudachi.dictionary.WordInfoList;
2423

2524
import java.io.IOException;
@@ -63,17 +62,19 @@ public class RawLexicon {
6362
* used to resolve wordref.
6463
*
6564
* @param lexicon
65+
* lexicon of a system dictionary.
6666
* @return number of entries read.
6767
*/
68-
public int preloadFrom(Lexicon lexicon, Progress progress) {
68+
public int preloadFrom(DoubleArrayLexicon lexicon, Progress progress) {
6969
this.isUser = true;
7070

7171
Ints allIds = new Ints(lexicon.size());
72-
Iterator<Ints> ids = lexicon.wordIds(0);
72+
Iterator<Ints> ids = lexicon.getWordIdTable().wordIds();
7373
while (ids.hasNext()) {
7474
allIds.appendAll(ids.next());
7575
}
7676
allIds.sort();
77+
7778
for (int i = 0; i < allIds.length(); i++) {
7879
preloadedEntries.add(new CompiledWordEntry(lexicon, allIds.get(i)));
7980
progress.progress(i, allIds.length());

0 commit comments

Comments
 (0)