Skip to content

Commit

Permalink
Add LUCENENET-specific backport comment, fix test name, fix test position and code style
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Oct 29, 2024
1 parent d44ff50 commit fd87edb
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 96 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ public override bool IncrementToken()
noShingleOutput = false;
}
offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
// posLenAtt.PositionLength = builtGramSize;
// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
if (outputUnigrams)
{
posLenAtt.PositionLength = builtGramSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -408,101 +408,6 @@ public virtual void TestPositionIncrementGreaterThanNWithoutUnigrams()
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
}



[Test]
public void testPositionLength()
{
Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be or not", "be or not to", "or not to be"},
new int[] {0, 3, 6},
new int[] { 12, 15, 18 },
null,
new int[] { 1, 1, 1 },
new int[] { 1, 1, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);


a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
"or not to be", "not to", "not to be", "to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});


AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
"or not to be", "not to be"},
new int[] { 0, 0, 3, 3, 6, 6, 9 },
new int[] { 8, 12, 12, 15, 15, 18, 18 },
null,
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
"be or not to be", "or not to", "or not to be", "not to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);
}

[Test]
public virtual void TestReset()
{
Expand Down Expand Up @@ -712,5 +617,97 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()

AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
}

// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
[Test]
public void TestPositionLength()
{
Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be or not", "be or not to", "or not to be"},
new int[] {0, 3, 6},
new int[] { 12, 15, 18 },
null,
new int[] { 1, 1, 1 },
new int[] { 1, 1, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
"or not to be", "not to", "not to be", "to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
"or not to be", "not to be"},
new int[] { 0, 0, 3, 3, 6, 6, 9 },
new int[] { 8, 12, 12, 15, 15, 18, 18 },
null,
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);

a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
filter.SetOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
});

AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
new string[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
"be or not to be", "or not to", "or not to be", "not to be"},
new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
null,
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
18,
// offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
// finishing at the same position
false);
}
}
}

0 comments on commit fd87edb

Please sign in to comment.