Skip to content

Commit 148d072

Browse files
Merge pull request #245 from WorksApplications/feature/201-lookup
Add lookup and oovMorpheme method
2 parents cacdf10 + 08c09cc commit 148d072

17 files changed

+885
-290
lines changed

src/main/java/com/worksap/nlp/sudachi/Dictionary.java

+51-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -56,6 +56,56 @@ public interface Dictionary extends AutoCloseable {
5656
@Override
5757
public void close() throws IOException;
5858

59+
/**
60+
* Lookup entries in the dictionary without performing an analysis.
61+
*
62+
* Specified surface will be normalized. This will work like performing analysis
63+
* on the given headword and finding paths with a single morpheme, but returns all
64+
* paths instead of the lowest cost one.
65+
*
66+
* @param surface
67+
* to lookup. Will be normalized beforehand.
68+
* @return a list of morphemes that match the surface. Their begin/end will be
69+
* 0/length of their headword.
70+
*/
71+
public List<Morpheme> lookup(CharSequence surface);
72+
73+
/**
74+
* Create an out-of-vocabulary morpheme from the pos id and string forms.
75+
*
76+
* Begin/end will be set based on the surface.
77+
*
78+
* @param posId
79+
* part-of-speech id of the morpheme
80+
* @param surface
81+
* surface of the morpheme
82+
* @param reading
83+
* reading form of the morpheme
84+
* @param normalizedForm
85+
* normalized form of the morpheme
86+
* @param dictionaryForm
87+
* dictionary form of the morpheme
88+
* @return an oov morpheme with given information
89+
*/
90+
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
91+
String dictionaryForm);
92+
93+
/**
94+
* Create an out-of-vocabulary morpheme from the pos id and the surface.
95+
*
96+
* Use the surface for the other string forms. Begin/end will be set based on the
97+
* surface.
98+
*
99+
* @param posId
100+
* part-of-speech id of the morpheme
101+
* @param surface
102+
* surface of the morpheme
103+
* @return an oov morpheme with given information
104+
*/
105+
public default Morpheme oovMorpheme(short posId, String surface) {
106+
return oovMorpheme(posId, surface, surface, surface, surface);
107+
}
108+
59109
/**
60110
* Returns the number of types of part-of-speech.
61111
*

src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java

+35-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017-2022 Works Applications Co., Ltd.
2+
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -127,6 +127,40 @@ public void close() throws IOException {
127127
}
128128
}
129129

130+
@Override
131+
public List<Morpheme> lookup(CharSequence surface) {
132+
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
133+
for (InputTextPlugin plugin : inputTextPlugins) {
134+
plugin.rewrite(builder);
135+
}
136+
UTF8InputText input = builder.build();
137+
byte[] bytes = input.getByteText();
138+
139+
List<Morpheme> morphemes = new ArrayList<>();
140+
WordLookup wordLookup = lexicon.makeLookup();
141+
wordLookup.reset(bytes, 0, bytes.length);
142+
while (wordLookup.next()) {
143+
int end = wordLookup.getEndOffset();
144+
if (end != bytes.length) {
145+
continue;
146+
}
147+
int numWords = wordLookup.getNumWords();
148+
int[] wordIds = wordLookup.getWordsIds();
149+
for (int word = 0; word < numWords; ++word) {
150+
int wordId = wordIds[word];
151+
Morpheme morpheme = new SingleMorphemeImpl(getGrammar(), getLexicon(), wordId);
152+
morphemes.add(morpheme);
153+
}
154+
}
155+
return morphemes;
156+
}
157+
158+
@Override
159+
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
160+
String dictionaryForm) {
161+
return new SingleMorphemeImpl(getGrammar(), posId, surface, reading, normalizedForm, dictionaryForm);
162+
}
163+
130164
@Override
131165
public Tokenizer tokenizer() {
132166
if (grammar == null || lexicon == null) {

src/main/java/com/worksap/nlp/sudachi/LatticeNode.java

+21-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -98,25 +98,41 @@ public interface LatticeNode {
9898
*/
9999
public WordInfo getWordInfo();
100100

101+
/**
102+
* Returns the string information of the node.
103+
*
104+
* @return the string information of the node.
105+
* @see StringsCache
106+
*/
107+
public StringsCache getStrings();
108+
101109
/**
102110
* @return the text of node.
103111
*/
104-
public String getSurface();
112+
public default String getSurface() {
113+
return getStrings().getSurface();
114+
}
105115

106116
/**
107117
* @return the reading form of node.
108118
*/
109-
public String getReading();
119+
public default String getReading() {
120+
return getStrings().getReading();
121+
}
110122

111123
/**
112124
* @return the normalized form of node.
113125
*/
114-
public String getNormalizedForm();
126+
public default String getNormalizedForm() {
127+
return getStrings().getNormalizedForm();
128+
}
115129

116130
/**
117131
* @return the dictionary form of node.
118132
*/
119-
public String getDictionaryForm();
133+
public default String getDictionaryForm() {
134+
return getStrings().getDictionaryForm();
135+
}
120136

121137
/**
122138
* Sets the morpheme information to the node.

src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java

+13-108
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Works Applications Co., Ltd.
2+
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -42,13 +42,13 @@ public class LatticeNodeImpl implements LatticeNode {
4242
LatticeNodeImpl bestPreviousNode;
4343

4444
// either Lexicon or StringsCache object
45-
Object lexicon;
45+
Object lexiconOrStrings;
4646

4747
// Empty wordInfo for special words.
4848
static final WordInfo UNDEFINED_WORDINFO = new WordInfo((short) 0, (short) -1);
4949

5050
LatticeNodeImpl(Lexicon lexicon, long params, int wordId) {
51-
this.lexicon = lexicon;
51+
this.lexiconOrStrings = lexicon;
5252
this.leftId = WordParameters.leftId(params);
5353
this.rightId = WordParameters.rightId(params);
5454
this.cost = WordParameters.cost(params);
@@ -82,7 +82,7 @@ static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface,
8282
LatticeNodeImpl node = new LatticeNodeImpl();
8383
node.wordId = WordId.makeOov(posId);
8484
node.wordInfo = new WordInfo((short) (end - begin), posId);
85-
node.lexicon = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
85+
node.lexiconOrStrings = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
8686
node.begin = begin;
8787
node.end = end;
8888
return node;
@@ -110,10 +110,10 @@ public void setParameter(long params) {
110110
}
111111

112112
private Lexicon lexicon() {
113-
if (lexicon instanceof Lexicon) {
114-
return (Lexicon) lexicon;
115-
} else if (lexicon instanceof StringsCache) {
116-
return ((StringsCache) lexicon).lexicon;
113+
if (lexiconOrStrings instanceof Lexicon) {
114+
return (Lexicon) lexiconOrStrings;
115+
} else if (lexiconOrStrings instanceof StringsCache) {
116+
return ((StringsCache) lexiconOrStrings).getLexicon();
117117
} else {
118118
throw new IllegalStateException("lexicon was null probably");
119119
}
@@ -191,26 +191,6 @@ public boolean isConnectedToBOS() {
191191
return bestPreviousNode != null;
192192
}
193193

194-
@Override
195-
public String getSurface() {
196-
return strings().getSurface(this);
197-
}
198-
199-
@Override
200-
public String getReading() {
201-
return strings().getReading(this);
202-
}
203-
204-
@Override
205-
public String getNormalizedForm() {
206-
return strings().getNormalizedForm(this);
207-
}
208-
209-
@Override
210-
public String getDictionaryForm() {
211-
return strings().getDictionaryForm(this);
212-
}
213-
214194
@Override
215195
public String toString() {
216196
String surface = getSurface();
@@ -220,11 +200,12 @@ public String toString() {
220200
cost);
221201
}
222202

223-
private StringsCache strings() {
224-
Object l = lexicon;
203+
@Override
204+
public StringsCache getStrings() {
205+
Object l = lexiconOrStrings;
225206
if (l instanceof Lexicon) {
226-
StringsCache c = new StringsCache((Lexicon) l);
227-
lexicon = c;
207+
StringsCache c = new StringsCache((Lexicon) l, wordId);
208+
lexiconOrStrings = c;
228209
return c;
229210
} else if (l instanceof StringsCache) {
230211
return (StringsCache) l;
@@ -274,82 +255,6 @@ private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
274255
}
275256
}
276257

277-
/**
278-
* Cache to reduce the access to the lexicon. Also used to mock the lexicon for
279-
* OOV nodes.
280-
*/
281-
private static final class StringsCache {
282-
private final Lexicon lexicon;
283-
private String surface;
284-
private String reading;
285-
private String normalizedForm;
286-
private String dictionaryForm;
287-
288-
public StringsCache(Lexicon lexicon) {
289-
this.lexicon = lexicon;
290-
}
291-
292-
public StringsCache(String surface, String readingForm, String normalizedForm, String dictionaryForm) {
293-
this.lexicon = null;
294-
this.surface = surface;
295-
this.reading = readingForm;
296-
this.normalizedForm = normalizedForm;
297-
this.dictionaryForm = dictionaryForm;
298-
}
299-
300-
public String getSurface(LatticeNodeImpl node) {
301-
// benign data race pattern
302-
// https://shipilev.net/blog/2016/close-encounters-of-jmm-kind/#wishful-benign-is-resilient
303-
String s = surface;
304-
if (s == null) {
305-
WordInfo wi = node.getWordInfo();
306-
int headwordPtr = wi.getHeadword();
307-
int dic = WordId.dic(node.getWordId());
308-
s = lexicon.string(dic, headwordPtr);
309-
surface = s;
310-
}
311-
return s;
312-
}
313-
314-
public String getReading(LatticeNodeImpl node) {
315-
String s = reading;
316-
if (s == null) {
317-
WordInfo wi = node.getWordInfo();
318-
int readingPtr = wi.getReadingForm();
319-
int dic = WordId.dic(node.getWordId());
320-
s = lexicon.string(dic, readingPtr);
321-
reading = s;
322-
}
323-
return s;
324-
}
325-
326-
public String getNormalizedForm(LatticeNodeImpl node) {
327-
String s = normalizedForm;
328-
if (s == null) {
329-
WordInfo wi = node.getWordInfo();
330-
int wordref = wi.getNormalizedForm();
331-
int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
332-
int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
333-
s = lexicon.string(dic, headwordPtr);
334-
normalizedForm = s;
335-
}
336-
return s;
337-
}
338-
339-
public String getDictionaryForm(LatticeNodeImpl node) {
340-
String s = dictionaryForm;
341-
if (s == null) {
342-
WordInfo wi = node.getWordInfo();
343-
int wordref = wi.getDictionaryForm();
344-
int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
345-
int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
346-
s = lexicon.string(dic, headwordPtr);
347-
dictionaryForm = s;
348-
}
349-
return s;
350-
}
351-
}
352-
353258
/** Alias for {@link OOVFactory} constructor. */
354259
public static OOVFactory oovFactory(short leftId, short rightId, short cost, short posId) {
355260
return new OOVFactory(leftId, rightId, cost, posId);

0 commit comments

Comments
 (0)