@@ -205,12 +205,13 @@ MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
205
205
LatticeImpl buildLattice (UTF8InputText input ) {
206
206
byte [] bytes = input .getByteText ();
207
207
lattice .resize (bytes .length );
208
- ArrayList <LatticeNodeImpl > unkNodes = new ArrayList <>(64 );
208
+ ArrayList <LatticeNodeImpl > crrNodes = new ArrayList <>(64 );
209
209
WordLookup wordLookup = lexicon .makeLookup ();
210
210
for (int byteBoundary = 0 ; byteBoundary < bytes .length ; byteBoundary ++) {
211
211
if (!input .canBow (byteBoundary ) || !lattice .hasPreviousNode (byteBoundary )) {
212
212
continue ;
213
213
}
214
+ crrNodes .clear ();
214
215
wordLookup .reset (bytes , byteBoundary , bytes .length );
215
216
long wordMask = 0L ;
216
217
while (wordLookup .next ()) {
@@ -224,7 +225,7 @@ LatticeImpl buildLattice(UTF8InputText input) {
224
225
int wordId = wordIds [word ];
225
226
LatticeNodeImpl n = new LatticeNodeImpl (lexicon , lexicon .parameters (wordId ), wordId );
226
227
lattice .insert (byteBoundary , end , n );
227
- unkNodes .add (n );
228
+ crrNodes .add (n );
228
229
wordMask = WordMask .addNth (wordMask , end - byteBoundary );
229
230
}
230
231
}
@@ -233,11 +234,11 @@ LatticeImpl buildLattice(UTF8InputText input) {
233
234
// OOV
234
235
if (!input .getCharCategoryTypes (byteBoundary ).contains (CategoryType .NOOOVBOW )) {
235
236
for (OovProviderPlugin plugin : oovProviderPlugins ) {
236
- wordMaskWithOov = provideOovs (plugin , input , unkNodes , byteBoundary , wordMaskWithOov );
237
+ wordMaskWithOov = provideOovs (plugin , input , byteBoundary , wordMaskWithOov , crrNodes );
237
238
}
238
239
}
239
240
if (wordMaskWithOov == 0 && defaultOovProvider != null ) {
240
- wordMaskWithOov = provideOovs (defaultOovProvider , input , unkNodes , byteBoundary , wordMaskWithOov );
241
+ wordMaskWithOov = provideOovs (defaultOovProvider , input , byteBoundary , wordMaskWithOov , crrNodes );
241
242
}
242
243
if (wordMaskWithOov == 0 ) {
243
244
throw new IllegalStateException ("failed to found any morpheme candidate at boundary " + byteBoundary );
@@ -249,19 +250,31 @@ LatticeImpl buildLattice(UTF8InputText input) {
249
250
}
250
251
251
252
/**
252
- * Create OOV nodes using plugin and add them to the lattice and unkNodes.
253
+ * Create OOV nodes using plugin at the given position, insert them into the
254
+ * lattice, and update crrNodes and wordMask.
253
255
*
256
+ * @param plugin
257
+ * OOVProviderPlugin to use
258
+ * @param input
259
+ * Full inputText
260
+ * @param boundary
261
+ * Byte index of inputText where OOV nodes should start from
262
+ * @param crrNodes
263
+ * Nodes already provided by dict or other plugins. Nodes newly created
264
+ * by the plugin are appended to the end of this list
265
+ * @param wordMask
266
+ * Word mask based on crrNodes
254
267
* @return wordMask updated based on created OOV nodes.
255
268
*/
256
- private long provideOovs (OovProviderPlugin plugin , UTF8InputText input , ArrayList < LatticeNodeImpl > unkNodes ,
257
- int boundary , long wordMask ) {
258
- int initialSize = unkNodes .size ();
259
- int created = plugin .provideOOV (input , boundary , wordMask , unkNodes );
269
+ private long provideOovs (OovProviderPlugin plugin , UTF8InputText input , int boundary , long wordMask ,
270
+ ArrayList < LatticeNodeImpl > crrNodes ) {
271
+ int initialSize = crrNodes .size ();
272
+ int created = plugin .provideOOV (input , boundary , wordMask , crrNodes );
260
273
if (created == 0 ) {
261
274
return wordMask ;
262
275
}
263
276
for (int i = initialSize ; i < initialSize + created ; ++i ) {
264
- LatticeNodeImpl node = unkNodes .get (i );
277
+ LatticeNodeImpl node = crrNodes .get (i );
265
278
lattice .insert (node .getBegin (), node .getEnd (), node );
266
279
wordMask = WordMask .addNth (wordMask , node .getEnd () - node .getBegin ());
267
280
}
0 commit comments