@@ -408,101 +408,6 @@ public virtual void TestPositionIncrementGreaterThanNWithoutUnigrams()
408408 this . shingleFilterTest ( 2 , 3 , TEST_TOKEN_POS_INCR_GREATER_THAN_N , TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS , TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS , TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS , false ) ;
409409 }
410410
411-
412-
/// <summary>
/// Feeds "to be or not to be" through a whitespace <see cref="MockTokenizer"/> and a
/// <see cref="ShingleFilter"/> that has unigram output disabled, once for each of the
/// shingle-size pairs (4,4), (2,4), (3,4) and (3,5), and compares the resulting stream
/// with hard-coded expectations.
/// </summary>
[Test]
public void testPositionLength()
{
    const string phrase = "to be or not to be";

    // --- min = 4, max = 4 ---
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        var source = new MockTokenizer(
            new MockBytesAttributeFactory(),
            reader,
            MockTokenizer.WHITESPACE,
            false,
            MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        var shingles = new ShingleFilter(source, 4, 4);
        shingles.SetOutputUnigrams(false);
        return new TokenStreamComponents(source, shingles);
    });

    // The trailing "false": offsets are correct, but AssertTokenStreamContents does not
    // handle multiple terms with different offsets finishing at the same position.
    AssertTokenStreamContents(
        a.GetTokenStream("", phrase),
        new[] { "to be or not", "be or not to", "or not to be" },
        new[] { 0, 3, 6 },
        new[] { 12, 15, 18 },
        null,
        new[] { 1, 1, 1 },
        new[] { 1, 1, 1 },
        18,
        false);

    // --- min = 2, max = 4 ---
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        var source = new MockTokenizer(
            new MockBytesAttributeFactory(),
            reader,
            MockTokenizer.WHITESPACE,
            false,
            MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        var shingles = new ShingleFilter(source, 2, 4);
        shingles.SetOutputUnigrams(false);
        return new TokenStreamComponents(source, shingles);
    });

    // Same reason as above for passing "false" as the last argument.
    AssertTokenStreamContents(
        a.GetTokenStream("", phrase),
        new[]
        {
            "to be", "to be or", "to be or not",
            "be or", "be or not", "be or not to",
            "or not", "or not to", "or not to be",
            "not to", "not to be",
            "to be"
        },
        new[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
        new[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
        null,
        new[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
        new[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
        18,
        false);

    // --- min = 3, max = 4 ---
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        var source = new MockTokenizer(
            new MockBytesAttributeFactory(),
            reader,
            MockTokenizer.WHITESPACE,
            false,
            MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        var shingles = new ShingleFilter(source, 3, 4);
        shingles.SetOutputUnigrams(false);
        return new TokenStreamComponents(source, shingles);
    });

    // Same reason as above for passing "false" as the last argument.
    AssertTokenStreamContents(
        a.GetTokenStream("", phrase),
        new[]
        {
            "to be or", "to be or not",
            "be or not", "be or not to",
            "or not to", "or not to be",
            "not to be"
        },
        new[] { 0, 0, 3, 3, 6, 6, 9 },
        new[] { 8, 12, 12, 15, 15, 18, 18 },
        null,
        new[] { 1, 0, 1, 0, 1, 0, 1, 0 },
        new[] { 1, 2, 1, 2, 1, 2, 1, 2 },
        18,
        false);

    // --- min = 3, max = 5 ---
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        var source = new MockTokenizer(
            new MockBytesAttributeFactory(),
            reader,
            MockTokenizer.WHITESPACE,
            false,
            MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        var shingles = new ShingleFilter(source, 3, 5);
        shingles.SetOutputUnigrams(false);
        return new TokenStreamComponents(source, shingles);
    });

    // Same reason as above for passing "false" as the last argument.
    AssertTokenStreamContents(
        a.GetTokenStream("", phrase),
        new[]
        {
            "to be or", "to be or not", "to be or not to",
            "be or not", "be or not to", "be or not to be",
            "or not to", "or not to be",
            "not to be"
        },
        new[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
        new[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
        null,
        new[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
        new[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
        18,
        false);
}
505-
506411 [ Test ]
507412 public virtual void TestReset ( )
508413 {
@@ -712,5 +617,97 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
712617
713618 AssertTokenStreamContents ( filter , new string [ ] { "purple" , "purplewizard" , "purplewizard" , "wizard" , "wizard" , "wizard" } , new int [ ] { 0 , 0 , 0 , 7 , 7 , 7 } , new int [ ] { 6 , 13 , 20 , 13 , 20 , 20 } , new int [ ] { 1 , 0 , 0 , 1 , 0 , 0 } , 20 ) ;
714619 }
620+
// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
/// <summary>
/// Runs the phrase "to be or not to be" through a whitespace <see cref="MockTokenizer"/>
/// followed by a <see cref="ShingleFilter"/> with unigram output switched off, for the
/// shingle-size pairs (4,4), (2,4), (3,4) and (3,5), and checks the emitted shingles
/// against fixed expectations.
/// </summary>
/// <remarks>
/// Every expected array holds exactly one entry per expected term. The arguments after the
/// token stream are, in order: terms, start offsets, end offsets, types (<c>null</c>, i.e.
/// not supplied), two further per-term int arrays (position increments and position lengths
/// per the AssertTokenStreamContents signature - confirm there), the final value 18 (the
/// length of the input phrase), and the offsets-are-correct flag.
/// </remarks>
[Test]
public void TestPositionLength()
{
    const string text = "to be or not to be";
    const int finalOffset = 18;

    // offsets are correct but AssertTokenStreamContents does not handle multiple terms
    // with different offsets finishing at the same position
    const bool offsetsAreCorrect = false;

    Analyzer a = NewShingleOnlyAnalyzer(4, 4);
    AssertTokenStreamContents(a.GetTokenStream("", text),
        new string[] { "to be or not", "be or not to", "or not to be" },
        new int[] { 0, 3, 6 },
        new int[] { 12, 15, 18 },
        null,
        new int[] { 1, 1, 1 },
        new int[] { 1, 1, 1 },
        finalOffset,
        offsetsAreCorrect);

    a = NewShingleOnlyAnalyzer(2, 4);
    AssertTokenStreamContents(a.GetTokenStream("", text),
        new string[] { "to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
            "or not to be", "not to", "not to be", "to be" },
        new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
        new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
        null,
        new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
        new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
        finalOffset,
        offsetsAreCorrect);

    a = NewShingleOnlyAnalyzer(3, 4);
    AssertTokenStreamContents(a.GetTokenStream("", text),
        new string[] { "to be or", "to be or not", "be or not", "be or not to", "or not to",
            "or not to be", "not to be" },
        new int[] { 0, 0, 3, 3, 6, 6, 9 },
        new int[] { 8, 12, 12, 15, 15, 18, 18 },
        null,
        // 7 terms => 7 entries (the previous arrays carried a surplus 8th element)
        new int[] { 1, 0, 1, 0, 1, 0, 1 },
        new int[] { 1, 2, 1, 2, 1, 2, 1 },
        finalOffset,
        offsetsAreCorrect);

    a = NewShingleOnlyAnalyzer(3, 5);
    AssertTokenStreamContents(a.GetTokenStream("", text),
        new string[] { "to be or", "to be or not", "to be or not to", "be or not", "be or not to",
            "be or not to be", "or not to", "or not to be", "not to be" },
        // 9 terms => 9 entries (the previous array carried a surplus 10th element)
        new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9 },
        new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
        null,
        // 9 terms => 9 entries (the previous array carried a surplus 10th element)
        new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1 },
        new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
        finalOffset,
        offsetsAreCorrect);
}

/// <summary>
/// Builds an analyzer whose chain is a whitespace <see cref="MockTokenizer"/> (no
/// lower-casing, default maximum token length, <see cref="MockBytesAttributeFactory"/>)
/// wrapped in a <see cref="ShingleFilter"/> that does not output unigrams.
/// </summary>
/// <param name="minShingleSize">Second constructor argument passed to <see cref="ShingleFilter"/>.</param>
/// <param name="maxShingleSize">Third constructor argument passed to <see cref="ShingleFilter"/>.</param>
/// <returns>A new anonymous <see cref="Analyzer"/> producing only shingles.</returns>
private static Analyzer NewShingleOnlyAnalyzer(int minShingleSize, int maxShingleSize)
{
    return Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
        Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        ShingleFilter filter = new ShingleFilter(tokenizer, minShingleSize, maxShingleSize);
        filter.SetOutputUnigrams(false);
        return new TokenStreamComponents(tokenizer, filter);
    });
}
715712 }
716713}
0 commit comments