Merge branch 'main' into knn-dictionary
ChrisHegarty committed Oct 29, 2024
2 parents ed233ba + 937432a commit 5c2cb2d
Showing 57 changed files with 724 additions and 737 deletions.
26 changes: 25 additions & 1 deletion lucene/CHANGES.txt
@@ -7,7 +7,7 @@ http://s.apache.org/luceneversions

API Changes
---------------------
(No changes)
* GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski)

New Features
---------------------
@@ -36,6 +36,10 @@ API Changes

* GITHUB#13859: Allow open-ended ranges in Intervals range queries. (Mayya Sharipova)

* GITHUB#13950: Make BooleanQuery#getClauses public and add #add(Collection<BooleanClause>) to BQ builder. (Shubham Chaudhary)

* GITHUB#13957: Removed LeafSimScorer class, to save its overhead. Scorers now
compute scores directly from a SimScorer, postings and norms. (Adrien Grand)

* GITHUB#13831: Complete refactoring of random-access vector API, eliminating copy() method. Now random-access vectors
are accessed by calling Byte/FloatVectorValues.vectors().get(int).
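The GITHUB#13950 entry above names a new bulk-add overload on the query builder. The sketch below is a hedged illustration of how it might be used; the class name is invented for illustration and the exact signatures, taken from the entry itself, should be checked against the released javadoc.

```java
// Hedged sketch of the Builder#add(Collection<BooleanClause>) overload named in GITHUB#13950.
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class BulkClauseExample {
  public static void main(String[] args) {
    // Clauses assembled elsewhere as a collection...
    List<BooleanClause> clauses = List.of(
        new BooleanClause(new TermQuery(new Term("body", "lucene")), BooleanClause.Occur.SHOULD),
        new BooleanClause(new TermQuery(new Term("body", "search")), BooleanClause.Occur.SHOULD));
    // ...can be added in one call instead of one add() per clause.
    BooleanQuery query = new BooleanQuery.Builder()
        .add(clauses)                      // bulk overload added by GITHUB#13950 (assumed signature)
        .setMinimumNumberShouldMatch(1)
        .build();
    System.out.println(query);
  }
}
```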
@@ -56,12 +60,32 @@ Optimizations
* GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the
minimum competitive allows for a more favorable partitioning. (Adrien Grand)

* GITHUB#13930: Use growNoCopy when copying bytes in BytesRefBuilder. (Ignacio Vera)

* GITHUB#13931: Refactored `BooleanScorer` to evaluate matches of sub clauses
using the `Scorer` abstraction rather than the `BulkScorer` abstraction. This
speeds up exhaustive evaluation of disjunctions of term queries.
(Adrien Grand)

* GITHUB#13941: Optimized computation of top-hits on disjunctive queries with
many clauses. (Adrien Grand)

* GITHUB#13954: Disabled exchanging scores across slices for exhaustive
top-hits evaluation. (Adrien Grand)

* GITHUB#13899: Check ahead if we can get the count. (Lu Xugang)

* GITHUB#13943: Removed shared `HitsThresholdChecker`, which reduces overhead
but may delay a bit when dynamic pruning kicks in. (Adrien Grand)

Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
when they were not sorted by startOffset. (Seunghan Jung)
* GITHUB#13884: Remove broken .toArray from Long/CharObjectHashMap entirely. (Pan Guixin)
* GITHUB#12686: Added support for highlighting IndexOrDocValuesQuery. (Prudhvi Godithi)
* GITHUB#13927: Fix StoredFieldsConsumer finish. (linfn)
* GITHUB#13944: Ensure deterministic order of clauses for `DisjunctionMaxQuery#toString`. (Laurent Jakubina)

Build
---------------------
7 changes: 7 additions & 0 deletions lucene/MIGRATE.md
@@ -19,6 +19,13 @@

## Migration from Lucene 9.x to Lucene 10.0

### DataInput#readVLong() may now read negative vlongs

LUCENE-10376 started allowing `DataInput#readVLong()` to read negative vlongs.
In particular, this feature is used by the `DataInput#readZLong()` method. A
practical implication is that `DataInput#readVLong()` may now read up to 10
bytes, while it would never read more than 9 bytes in Lucene 9.x.
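As a quick illustration of why the byte count changes, here is a self-contained sketch (not the Lucene implementation): a vLong stores 7 payload bits per byte plus a continuation bit, so a negative value, whose sign bit makes all 64 bits significant, needs ceil(64 / 7) = 10 bytes, one more than any non-negative value.

```java
// Hedged sketch, illustration only: counts the 7-bit groups a vLong-style encoding would need.
public class VLongWidth {
  static int vLongByteCount(long value) {
    int bytes = 1;
    while ((value >>> 7) != 0) {  // unsigned shift: count remaining 7-bit groups
      value >>>= 7;
      bytes++;
    }
    return bytes;
  }

  public static void main(String[] args) {
    System.out.println(vLongByteCount(127L));            // 1
    System.out.println(vLongByteCount(Long.MAX_VALUE));  // 9 -- the former maximum
    System.out.println(vLongByteCount(-1L));             // 10 -- possible since LUCENE-10376
  }
}
```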

### Changes to DataInput.readGroupVInt and readGroupVInts methods

As part of GITHUB#13820, GITHUB#13825, GITHUB#13830, this issue corrects DataInput.readGroupVInts
4 changes: 2 additions & 2 deletions lucene/core/src/generated/checksums/generateForDeltaUtil.json
@@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "f561578ccb6a95364bb62c5ed86b38ff0b4a009d",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "eea1a71be9da8a13fdd979354dc4a8c6edf21be1"
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "b662da5848b0decc8bceb4225f433875ae9e3c11",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "01787b97bbe79edb7703498cef8ddb85901a6b1e"
}
4 changes: 2 additions & 2 deletions lucene/core/src/generated/checksums/generateForUtil.json
@@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "02e0c8c290e65d0314664fde24c9331bdec44925",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "d7850f37e52a16c6592322950d0f6219cad23a33"
}
@@ -286,19 +286,19 @@ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, lo
throws IOException {
switch (bitsPerValue) {
case 1:
decode1(pdu, tmp, longs);
decode1(pdu, longs);
prefixSum8(longs, base);
break;
case 2:
decode2(pdu, tmp, longs);
decode2(pdu, longs);
prefixSum8(longs, base);
break;
case 3:
decode3(pdu, tmp, longs);
prefixSum8(longs, base);
break;
case 4:
decode4(pdu, tmp, longs);
decode4(pdu, longs);
prefixSum8(longs, base);
break;
case 5:
@@ -314,7 +314,7 @@ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, lo
prefixSum16(longs, base);
break;
case 8:
decode8To16(pdu, tmp, longs);
decode8To16(pdu, longs);
prefixSum16(longs, base);
break;
case 9:
@@ -346,7 +346,7 @@ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, lo
prefixSum32(longs, base);
break;
case 16:
decode16To32(pdu, tmp, longs);
decode16To32(pdu, longs);
prefixSum32(longs, base);
break;
case 17:
@@ -431,8 +431,7 @@ private static void decode7To16(PostingDecodingUtil pdu, long[] tmp, long[] long
}
}

private static void decode8To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
private static void decode8To16(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.splitLongs(16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
}

@@ -522,8 +521,7 @@ private static void decode15To32(PostingDecodingUtil pdu, long[] tmp, long[] lon
}
}

private static void decode16To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
private static void decode16To32(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.splitLongs(32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
}
}
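For readers unfamiliar with the decode-then-prefix-sum pattern in `decodeAndPrefixSum` above: the block stores deltas, and the prefix sum rebuilds absolute values starting from `base`. The sketch below is only an illustration of that arithmetic, not the vectorized Lucene code; names are invented for illustration.

```java
// Hedged illustration of what prefixSum*(longs, base) computes conceptually.
public class PrefixSumSketch {
  static void prefixSum(long[] deltas, long base) {
    deltas[0] += base;  // first value is base + its delta
    for (int i = 1; i < deltas.length; i++) {
      deltas[i] += deltas[i - 1];  // each value becomes the previous absolute value plus its delta
    }
  }

  public static void main(String[] args) {
    long[] block = {3, 2, 5};
    prefixSum(block, 100);
    System.out.println(java.util.Arrays.toString(block));  // [103, 105, 110]
  }
}
```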
@@ -291,19 +291,19 @@ static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, lo
void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
switch (bitsPerValue) {
case 1:
decode1(pdu, tmp, longs);
decode1(pdu, longs);
expand8(longs);
break;
case 2:
decode2(pdu, tmp, longs);
decode2(pdu, longs);
expand8(longs);
break;
case 3:
decode3(pdu, tmp, longs);
expand8(longs);
break;
case 4:
decode4(pdu, tmp, longs);
decode4(pdu, longs);
expand8(longs);
break;
case 5:
@@ -319,7 +319,7 @@ void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOEx
expand8(longs);
break;
case 8:
decode8(pdu, tmp, longs);
decode8(pdu, longs);
expand8(longs);
break;
case 9:
@@ -351,7 +351,7 @@ void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOEx
expand16(longs);
break;
case 16:
decode16(pdu, tmp, longs);
decode16(pdu, longs);
expand16(longs);
break;
case 17:
@@ -393,11 +393,11 @@ void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOEx
}
}

static void decode1(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
static void decode1(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.splitLongs(2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
}

static void decode2(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
static void decode2(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.splitLongs(4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
}

@@ -413,7 +413,7 @@ static void decode3(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IO
}
}

static void decode4(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
static void decode4(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.splitLongs(8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
}

@@ -457,7 +457,7 @@ static void decode7(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IO
}
}

static void decode8(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
static void decode8(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.in.readLongs(longs, 0, 16);
}

@@ -601,7 +601,7 @@ static void decode15(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws I
}
}

static void decode16(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
static void decode16(PostingDecodingUtil pdu, long[] longs) throws IOException {
pdu.in.readLongs(longs, 0, 32);
}

@@ -427,7 +427,7 @@ public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
resetIndexInput(termState);
if (pforUtil == null && docFreq >= BLOCK_SIZE) {
pforUtil = new PForUtil(new ForUtil());
pforUtil = new PForUtil();
forDeltaUtil = new ForDeltaUtil();
}
totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
@@ -727,7 +727,7 @@ public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOExcep
}
totalTermFreq = termState.totalTermFreq;
if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) {
pforUtil = new PForUtil(new ForUtil());
pforUtil = new PForUtil();
}
// Where this term's postings start in the .pos file:
final long posTermStartFP = termState.posStartFP;
@@ -1142,7 +1142,7 @@ public long cost() {
private abstract class BlockImpactsEnum extends ImpactsEnum {

protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
protected final PForUtil pforUtil = new PForUtil(new ForUtil());
protected final PForUtil pforUtil = new PForUtil();

protected final long[] docBuffer = new long[BLOCK_SIZE + 1];
protected final long[] freqBuffer = new long[BLOCK_SIZE];
@@ -142,9 +142,8 @@ public Lucene912PostingsWriter(SegmentWriteState state) throws IOException {
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
final ForUtil forUtil = new ForUtil();
forDeltaUtil = new ForDeltaUtil();
pforUtil = new PForUtil(forUtil);
pforUtil = new PForUtil();
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new long[BLOCK_SIZE];
String posFileName =
@@ -38,11 +38,10 @@ static boolean allEqual(long[] l) {
return true;
}

private final ForUtil forUtil;
private final ForUtil forUtil = new ForUtil();

PForUtil(ForUtil forUtil) {
static {
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
this.forUtil = forUtil;
}

/** Encode 128 integers from {@code longs} into {@code out}. */
@@ -106,17 +105,18 @@ void encode(long[] longs, DataOutput out) throws IOException {

/** Decode 128 integers into {@code ints}. */
void decode(PostingDecodingUtil pdu, long[] longs) throws IOException {
final int token = Byte.toUnsignedInt(pdu.in.readByte());
var in = pdu.in;
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
final int numExceptions = token >>> 5;
if (bitsPerValue == 0) {
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, pdu.in.readVLong());
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong());
} else {
forUtil.decode(bitsPerValue, pdu, longs);
}
final int numExceptions = token >>> 5;
for (int i = 0; i < numExceptions; ++i) {
longs[Byte.toUnsignedInt(pdu.in.readByte())] |=
Byte.toUnsignedLong(pdu.in.readByte()) << bitsPerValue;
longs[Byte.toUnsignedInt(in.readByte())] |=
Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
}
}

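As a standalone restatement of the token byte that `decode` reads above (this only spells out the two bit operations already shown, it is not additional API): the low 5 bits carry `bitsPerValue` for the packed block, and the high 3 bits carry the number of patched exceptions that follow it.

```java
// Hedged illustration of the PForUtil token layout; class name invented for illustration.
public class PForTokenExample {
  public static void main(String[] args) {
    int token = 0x45;                 // example value: 0b010_00101
    int bitsPerValue = token & 0x1f;  // low 5 bits  -> 5
    int numExceptions = token >>> 5;  // high 3 bits -> 2
    System.out.println(bitsPerValue + " bits/value, " + numExceptions + " exceptions");
  }
}
```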
@@ -361,7 +361,10 @@ def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values,

def writeDecode(bpv, f):
next_primitive = primitive_size_for_bpv(bpv)
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
if next_primitive % bpv == 0:
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] longs) throws IOException {\n' %(bpv, next_primitive))
else:
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
if bpv == next_primitive:
f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
else:
@@ -390,9 +393,15 @@ def writeDecode(bpv, f):
primitive_size = primitive_size_for_bpv(bpv)
f.write(' case %d:\n' %bpv)
if next_primitive(bpv) == primitive_size:
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
if primitive_size % bpv == 0:
f.write(' decode%d(pdu, longs);\n' %bpv)
else:
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
else:
f.write(' decode%dTo%d(pdu, tmp, longs);\n' %(bpv, primitive_size))
if primitive_size % bpv == 0:
f.write(' decode%dTo%d(pdu, longs);\n' %(bpv, primitive_size))
else:
f.write(' decode%dTo%d(pdu, tmp, longs);\n' %(bpv, primitive_size))
f.write(' prefixSum%d(longs, base);\n' %primitive_size)
f.write(' break;\n')
f.write(' default:\n')
@@ -287,17 +287,19 @@ def writeDecode(bpv, f):
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
f.write(' static void decode%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %bpv)
if bpv == next_primitive:
f.write(' static void decode%d(PostingDecodingUtil pdu, long[] longs) throws IOException {\n' %bpv)
f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
else:
num_values_per_long = 64 / next_primitive
remaining_bits = next_primitive % bpv
num_iters = (next_primitive - 1) // bpv
o = 2 * bpv * num_iters
if remaining_bits == 0:
f.write(' static void decode%d(PostingDecodingUtil pdu, long[] longs) throws IOException {\n' %bpv)
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
else:
f.write(' static void decode%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %bpv)
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
f.write(' }\n')
@@ -334,7 +336,10 @@ def writeDecode(bpv, f):
elif bpv <= 16:
next_primitive = 16
f.write(' case %d:\n' %bpv)
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
if next_primitive % bpv == 0:
f.write(' decode%d(pdu, longs);\n' %bpv)
else:
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
f.write(' expand%d(longs);\n' %next_primitive)
f.write(' break;\n')
f.write(' default:\n')
@@ -27,7 +27,6 @@
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
@@ -120,7 +119,6 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio

@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
final Weight thisWeight = this;
Terms terms = Terms.getTerms(context.reader(), fieldName);
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
@@ -135,10 +133,8 @@
@Override
public Scorer get(long leadCost) throws IOException {
final SimScorer scorer = function.scorer(boost);
final LeafSimScorer simScorer =
new LeafSimScorer(scorer, context.reader(), fieldName, false);
final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS);
return new TermScorer(thisWeight, impacts, simScorer, topLevelScoringClause);
return new TermScorer(impacts, scorer, null, topLevelScoringClause);
}

@Override