Skip to content

Commit

Permalink
Add LUCENENET-specific backport comment, fix test name, fix test position and code style
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Oct 29, 2024
1 parent d44ff50 commit fd87edb
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 96 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ public override bool IncrementToken()
noShingleOutput = false;
}
offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
// posLenAtt.PositionLength = builtGramSize;
// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
if (outputUnigrams)
{
posLenAtt.PositionLength = builtGramSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -408,101 +408,6 @@ public virtual void TestPositionIncrementGreaterThanNWithoutUnigrams()
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
}



[Test]
public void testPositionLength()
{
Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be or not", "be or not to", "or not to be"},
new int[] {0, 3, 6},
new int[] { 12, 15, 18 },
null,
new int[] { 1, 1, 1 },
new int[] { 1, 1, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);


a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
"or not to be", "not to", "not to be", "to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});


AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
"or not to be", "not to be"},
new int[] { 0, 0, 3, 3, 6, 6, 9 },
new int[] { 8, 12, 12, 15, 15, 18, 18 },
null,
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
"be or not to be", "or not to", "or not to be", "not to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);
}

[Test]
public virtual void TestReset()
{
Expand Down Expand Up @@ -712,5 +617,97 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()

AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
}

// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
[Test]
public void TestPositionLength()
{
Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be or not", "be or not to", "or not to be"},
new int[] {0, 3, 6},
new int[] { 12, 15, 18 },
null,
new int[] { 1, 1, 1 },
new int[] { 1, 1, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
"or not to be", "not to", "not to be", "to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
"or not to be", "not to be"},
new int[] { 0, 0, 3, 3, 6, 6, 9 },
new int[] { 8, 12, 12, 15, 15, 18, 18 },
null,
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
"be or not to be", "or not to", "or not to be", "not to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);
}
}
}

0 comments on commit fd87edb

Please sign in to comment.