Skip to content

Commit fd87edb

Browse files
committed
Add LUCENENET-specific backport comment, fix test name, fix test position and code style
1 parent d44ff50 commit fd87edb

File tree

2 files changed

+93
-96
lines changed

2 files changed

+93
-96
lines changed

src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ public override bool IncrementToken()
366366
noShingleOutput = false;
367367
}
368368
offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
369-
// posLenAtt.PositionLength = builtGramSize;
369+
// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
370370
if (outputUnigrams)
371371
{
372372
posLenAtt.PositionLength = builtGramSize;

src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs

Lines changed: 92 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -408,101 +408,6 @@ public virtual void TestPositionIncrementGreaterThanNWithoutUnigrams()
408408
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
409409
}
410410

411-
412-
413-
[Test]
414-
public void testPositionLength()
415-
{
416-
Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
417-
{
418-
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
419-
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
420-
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
421-
filter.SetOutputUnigrams(false);
422-
return new TokenStreamComponents(tokenizer, filter);
423-
});
424-
425-
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
426-
new String[] {"to be or not", "be or not to", "or not to be"},
427-
new int[] {0, 3, 6},
428-
new int[] { 12, 15, 18 },
429-
null,
430-
new int[] { 1, 1, 1 },
431-
new int[] { 1, 1, 1 },
432-
18,
433-
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
434-
// finishing at the same position
435-
false);
436-
437-
438-
a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
439-
{
440-
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
441-
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
442-
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
443-
filter.SetOutputUnigrams(false);
444-
return new TokenStreamComponents(tokenizer, filter);
445-
});
446-
447-
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
448-
new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
449-
"or not to be", "not to", "not to be", "to be"},
450-
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
451-
new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
452-
null,
453-
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
454-
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
455-
18,
456-
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
457-
// finishing at the same position
458-
false);
459-
460-
a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
461-
{
462-
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
463-
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
464-
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
465-
filter.SetOutputUnigrams(false);
466-
return new TokenStreamComponents(tokenizer, filter);
467-
});
468-
469-
470-
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
471-
new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
472-
"or not to be", "not to be"},
473-
new int[] { 0, 0, 3, 3, 6, 6, 9 },
474-
new int[] { 8, 12, 12, 15, 15, 18, 18 },
475-
null,
476-
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
477-
new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
478-
18,
479-
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
480-
// finishing at the same position
481-
false);
482-
483-
a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
484-
{
485-
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
486-
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
487-
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
488-
filter.SetOutputUnigrams(false);
489-
return new TokenStreamComponents(tokenizer, filter);
490-
});
491-
492-
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
493-
new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
494-
"be or not to be", "or not to", "or not to be", "not to be"},
495-
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
496-
new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
497-
null,
498-
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
499-
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
500-
18,
501-
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
502-
// finishing at the same position
503-
false);
504-
}
505-
506411
[Test]
507412
public virtual void TestReset()
508413
{
@@ -712,5 +617,97 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
712617

713618
AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
714619
}
620+
621+
// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
622+
[Test]
623+
public void TestPositionLength()
624+
{
625+
Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
626+
{
627+
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
628+
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
629+
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
630+
filter.SetOutputUnigrams(false);
631+
return new TokenStreamComponents(tokenizer, filter);
632+
});
633+
634+
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
635+
new string[] {"to be or not", "be or not to", "or not to be"},
636+
new int[] {0, 3, 6},
637+
new int[] { 12, 15, 18 },
638+
null,
639+
new int[] { 1, 1, 1 },
640+
new int[] { 1, 1, 1 },
641+
18,
642+
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
643+
// finishing at the same position
644+
false);
645+
646+
a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
647+
{
648+
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
649+
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
650+
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
651+
filter.SetOutputUnigrams(false);
652+
return new TokenStreamComponents(tokenizer, filter);
653+
});
654+
655+
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
656+
new string[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
657+
"or not to be", "not to", "not to be", "to be"},
658+
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
659+
new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
660+
null,
661+
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
662+
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
663+
18,
664+
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
665+
// finishing at the same position
666+
false);
667+
668+
a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
669+
{
670+
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
671+
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
672+
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
673+
filter.SetOutputUnigrams(false);
674+
return new TokenStreamComponents(tokenizer, filter);
675+
});
676+
677+
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
678+
new string[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
679+
"or not to be", "not to be"},
680+
new int[] { 0, 0, 3, 3, 6, 6, 9 },
681+
new int[] { 8, 12, 12, 15, 15, 18, 18 },
682+
null,
683+
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
684+
new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
685+
18,
686+
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
687+
// finishing at the same position
688+
false);
689+
690+
a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
691+
{
692+
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
693+
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
694+
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
695+
filter.SetOutputUnigrams(false);
696+
return new TokenStreamComponents(tokenizer, filter);
697+
});
698+
699+
AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
700+
new string[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
701+
"be or not to be", "or not to", "or not to be", "not to be"},
702+
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
703+
new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
704+
null,
705+
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
706+
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
707+
18,
708+
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
709+
// finishing at the same position
710+
false);
711+
}
715712
}
716713
}

0 commit comments

Comments (0)