
Commit 8cccc37

CNDB-14317: Optimize doc freq computation for memtable BM25 queries (#1789)
- **CNDB-14317: Optimize doc freq computation for memtable BM25 queries**
- **Update approximateTotalTermCount() javadoc**

### What is the issue

Fixes riptano/cndb#14317

### What does this PR fix and why was it fixed

We optimize the in-memory BM25 computation by using the trie to get the number of rows matching a query term. This change removes a memtable scan-and-analyze pass, replacing it with calls to the index to get the number of matching docs. There are no new tests because the change is expected to maintain the same semantics. I will review the test coverage to verify that assertion.
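To make the shape of the change concrete, here is a minimal, self-contained Java sketch. It is illustrative only — `DocFreqSketch` and its toy maps are not Cassandra classes: the old path re-analyzes every document on every query to count how many contain each term, while the new path reads each term's document frequency from a structure the index already maintains. In the actual patch the counts come from `eagerSearch(expr, keyRange).getMaxKeys()` per term (or `completeEstimateMatchingRowsCount` in `orderResultsBy`), as the diff below shows.

```java
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

// Toy illustration only -- not Cassandra code. It contrasts the old brute-force
// document-frequency computation with reading the frequency from an index that
// is already maintained (as the memtable trie index is).
public class DocFreqSketch
{
    public static void main(String[] args)
    {
        Map<Integer, String> docs = Map.of(1, "apple", 2, "apple juice", 3, "orange juice");

        // The index side: term -> ids of documents containing it. The memtable
        // index keeps an equivalent structure in its trie, so it is "free" here.
        Map<String, Set<Integer>> index = new HashMap<>();
        docs.forEach((id, text) -> {
            for (String term : text.split(" "))
                index.computeIfAbsent(term, t -> new TreeSet<>()).add(id);
        });

        List<String> queryTerms = List.of("apple", "juice");

        // Old approach: re-analyze every document on every BM25 query.
        Map<String, Long> bruteForce = new HashMap<>();
        for (String text : docs.values())
            for (String term : new HashSet<>(Arrays.asList(text.split(" "))))
                if (queryTerms.contains(term))
                    bruteForce.merge(term, 1L, Long::sum);

        // New approach: ask the index for the number of matching documents per term.
        Map<String, Long> fromIndex = new HashMap<>();
        for (String term : queryTerms)
            fromIndex.put(term, (long) index.getOrDefault(term, Set.of()).size());

        System.out.println(bruteForce); // {apple=2, juice=2} (iteration order may vary)
        System.out.println(fromIndex);
    }
}
```

The saving is the same in the toy and in the real code: no per-query pass over every row and no re-tokenization of each document just to obtain the counts.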
1 parent 431b764 commit 8cccc37

File tree

2 files changed (+101, -76 lines)

src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java

Lines changed: 73 additions & 76 deletions
@@ -23,11 +23,10 @@
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.Objects;
-import java.util.Set;
 import java.util.concurrent.atomic.LongAdder;
 import java.util.stream.Stream;
 import javax.annotation.Nullable;
@@ -39,17 +38,14 @@
 
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.Clustering;
-import org.apache.cassandra.db.DataRange;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.PartitionPosition;
-import org.apache.cassandra.db.RegularAndStaticColumns;
-import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.memtable.ShardBoundaries;
 import org.apache.cassandra.db.memtable.TrieMemtable;
-import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.index.sai.IndexContext;
 import org.apache.cassandra.index.sai.QueryContext;
 import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
@@ -130,7 +126,7 @@ public int indexedRows()
 
     /**
      * Approximate total count of terms in the memory index.
-     * The count is approximate because deletions are not accounted for.
+     * The count is approximate because some deletions are not accounted for in the current implementation.
      *
      * @return total count of terms for indexes rows.
      */
@@ -290,7 +286,7 @@ public void update(DecoratedKey key, Clustering clustering, Iterator<ByteBuffer>
     public KeyRangeIterator search(QueryContext queryContext, Expression expression, AbstractBounds<PartitionPosition> keyRange, int limit)
     {
         int startShard = boundaries.getShardForToken(keyRange.left.getToken());
-        int endShard = keyRange.right.isMinimum() ? boundaries.shardCount() - 1 : boundaries.getShardForToken(keyRange.right.getToken());
+        int endShard = getEndShardForBounds(keyRange);
 
         KeyRangeConcatIterator.Builder builder = KeyRangeConcatIterator.builder(endShard - startShard + 1);
 
@@ -320,6 +316,20 @@ public KeyRangeIterator search(QueryContext queryContext, Expression expression,
         return builder.build();
     }
 
+    public KeyRangeIterator eagerSearch(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    {
+        int startShard = boundaries.getShardForToken(keyRange.left.getToken());
+        int endShard = getEndShardForBounds(keyRange);
+
+        KeyRangeConcatIterator.Builder builder = KeyRangeConcatIterator.builder(endShard - startShard + 1);
+        for (int shard = startShard; shard <= endShard; ++shard)
+        {
+            assert rangeIndexes[shard] != null;
+            builder.add(rangeIndexes[shard].search(expression, keyRange));
+        }
+        return builder.build();
+    }
+
     @Override
     public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext queryContext,
                                                                   Orderer orderer,
@@ -328,16 +338,29 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
                                                                   int limit)
     {
         int startShard = boundaries.getShardForToken(keyRange.left.getToken());
-        int endShard = keyRange.right.isMinimum() ? boundaries.shardCount() - 1 : boundaries.getShardForToken(keyRange.right.getToken());
+        int endShard = getEndShardForBounds(keyRange);
 
         if (orderer.isBM25())
         {
             // Intersect iterators to find documents containing all terms
             List<ByteBuffer> queryTerms = orderer.getQueryTerms();
-            List<KeyRangeIterator> termIterators = keyIteratorsPerTerm(queryContext, keyRange, queryTerms);
+            Map<ByteBuffer, Long> documentFrequencies = new HashMap<>();
+            List<KeyRangeIterator> termIterators = new ArrayList<>(queryTerms.size());
+            for (ByteBuffer term : queryTerms)
+            {
+                Expression expr = new Expression(indexContext).add(Operator.ANALYZER_MATCHES, term);
+                // getMaxKeys() counts all rows that match the expression for shards within the key range. The key
+                // range is not applied to the search results yet, so there is a small chance for overcounting if
+                // the key range filters within a shard. This is assumed to be acceptable because the on disk
+                // estimate also uses the key range to skip irrelevant sstable segments but does not apply the key
+                // range when getting the estimate within a segment.
+                KeyRangeIterator iterator = eagerSearch(expr, keyRange);
+                documentFrequencies.put(term, iterator.getMaxKeys());
+                termIterators.add(iterator);
+            }
            KeyRangeIterator intersectedIterator = KeyRangeIntersectionIterator.builder(termIterators).build();
 
-            return List.of(orderByBM25(Streams.stream(intersectedIterator), orderer));
+            return List.of(orderByBM25(Streams.stream(intersectedIterator), documentFrequencies, orderer));
         }
         else
         {
@@ -351,36 +374,50 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
         }
     }
 
-    private List<KeyRangeIterator> keyIteratorsPerTerm(QueryContext queryContext, AbstractBounds<PartitionPosition> keyRange, List<ByteBuffer> queryTerms)
-    {
-        List<KeyRangeIterator> termIterators = new ArrayList<>(queryTerms.size());
-        for (ByteBuffer term : queryTerms)
-        {
-            Expression expr = new Expression(indexContext);
-            expr.add(Operator.ANALYZER_MATCHES, term);
-            KeyRangeIterator iterator = search(queryContext, expr, keyRange, Integer.MAX_VALUE);
-            termIterators.add(iterator);
-        }
-        return termIterators;
-    }
-
     @Override
     public long estimateMatchingRowsCount(Expression expression, AbstractBounds<PartitionPosition> keyRange)
     {
         int startShard = boundaries.getShardForToken(keyRange.left.getToken());
-        int endShard = keyRange.right.isMinimum() ? boundaries.shardCount() - 1 : boundaries.getShardForToken(keyRange.right.getToken());
+        int endShard = getEndShardForBounds(keyRange);
         return rangeIndexes[startShard].estimateMatchingRowsCount(expression, keyRange) * (endShard - startShard + 1);
     }
 
+    // In the BM25 logic, estimateMatchingRowsCount is not accurate enough because we use the result to compute the
+    // document score.
+    private long completeEstimateMatchingRowsCount(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    {
+        int startShard = boundaries.getShardForToken(keyRange.left.getToken());
+        int endShard = getEndShardForBounds(keyRange);
+        long count = 0;
+        for (int shard = startShard; shard <= endShard; ++shard)
+        {
+            assert rangeIndexes[shard] != null;
+            count += rangeIndexes[shard].estimateMatchingRowsCount(expression, keyRange);
+        }
+        return count;
+    }
+
     @Override
     public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(QueryContext queryContext, List<PrimaryKey> keys, Orderer orderer, int limit)
     {
         if (keys.isEmpty())
             return CloseableIterator.emptyIterator();
 
         if (orderer.isBM25())
-            return orderByBM25(keys.stream(), orderer);
+        {
+            HashMap<ByteBuffer, Long> documentFrequencies = new HashMap<>();
+            // We only need to get the document frequencies for the shards that contain the keys.
+            Range<PartitionPosition> range = Range.makeRowRange(keys.get(0).partitionKey().getToken(),
+                                                                keys.get(keys.size() - 1).partitionKey().getToken());
+            for (ByteBuffer term : orderer.getQueryTerms())
+            {
+                Expression expression = new Expression(indexContext).add(Operator.ANALYZER_MATCHES, term);
+                documentFrequencies.put(term, completeEstimateMatchingRowsCount(expression, range));
+            }
+            return orderByBM25(keys.stream(), documentFrequencies, orderer);
+        }
         else
+        {
             return SortingIterator.createCloseable(
                 orderer.getComparator(),
                 keys,
@@ -403,14 +440,15 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(QueryContext quer
                 },
                 Runnables.doNothing()
             );
+        }
     }
 
-    private CloseableIterator<PrimaryKeyWithSortKey> orderByBM25(Stream<PrimaryKey> stream, Orderer orderer)
+    private CloseableIterator<PrimaryKeyWithSortKey> orderByBM25(Stream<PrimaryKey> stream, Map<ByteBuffer, Long> documentFrequencies, Orderer orderer)
     {
         assert orderer.isBM25();
         List<ByteBuffer> queryTerms = orderer.getQueryTerms();
         AbstractAnalyzer analyzer = indexContext.getAnalyzerFactory().create();
-        BM25Utils.DocStats docStats = computeDocumentFrequencies(queryTerms, analyzer);
+        BM25Utils.DocStats docStats = new BM25Utils.DocStats(documentFrequencies, indexedRows(), approximateTotalTermCount());
         Iterator<BM25Utils.DocTF> it = stream
             .map(pk -> BM25Utils.EagerDocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
             .filter(Objects::nonNull)
@@ -422,54 +460,6 @@ private CloseableIterator<PrimaryKeyWithSortKey> orderByBM25(Stream<PrimaryKey>
                                       memtable);
     }
 
-    /**
-     * Count document frequencies for each term using brute force
-     */
-    private BM25Utils.DocStats computeDocumentFrequencies(List<ByteBuffer> queryTerms, AbstractAnalyzer docAnalyzer)
-    {
-        var documentFrequencies = new HashMap<ByteBuffer, Long>();
-
-        // count all documents in the queried column
-        try (var it = memtable.makePartitionIterator(ColumnFilter.selection(RegularAndStaticColumns.of(indexContext.getDefinition())),
-                                                     DataRange.allData(memtable.metadata().partitioner)))
-        {
-            while (it.hasNext())
-            {
-                var partitions = it.next();
-                while (partitions.hasNext())
-                {
-                    var unfiltered = partitions.next();
-                    if (!unfiltered.isRow())
-                        continue;
-                    var row = (Row) unfiltered;
-                    var cell = row.getCell(indexContext.getDefinition());
-                    if (cell == null)
-                        continue;
-
-                    Set<ByteBuffer> queryTermsPerDoc = new HashSet<>(queryTerms.size());
-                    docAnalyzer.reset(cell.buffer());
-                    try
-                    {
-                        while (docAnalyzer.hasNext())
-                        {
-                            ByteBuffer term = docAnalyzer.next();
-                            if (queryTerms.contains(term))
-                                queryTermsPerDoc.add(term);
-                        }
-                    }
-                    finally
-                    {
-                        docAnalyzer.end();
-                    }
-                    for (ByteBuffer term : queryTermsPerDoc)
-                        documentFrequencies.merge(term, 1L, Long::sum);
-
-                }
-            }
-        }
-        return new BM25Utils.DocStats(documentFrequencies, indexedRows(), approximateTotalTermCount());
-    }
-
     @Nullable
     private org.apache.cassandra.db.rows.Cell<?> getCellForKey(PrimaryKey key)
     {
@@ -487,6 +477,13 @@ private ByteComparable encode(ByteBuffer input)
         return Version.current().onDiskFormat().encodeForTrie(input, indexContext.getValidator());
     }
 
+    private int getEndShardForBounds(AbstractBounds<PartitionPosition> bounds)
+    {
+        PartitionPosition position = bounds.right;
+        return position.isMinimum() ? boundaries.shardCount() - 1
+                                    : boundaries.getShardForToken(position.getToken());
+    }
+
     /**
      * NOTE: returned data may contain partition key not within the provided min and max which are only used to find
      * corresponding subranges. We don't do filtering here to avoid unnecessary token comparison. In case of JBOD,
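For context on why these counts feed a score rather than a filter: the DocStats built above packages the per-term document frequencies together with indexedRows() and approximateTotalTermCount(). Assuming BM25Utils follows the standard Okapi BM25 formulation (its exact constants are not shown in this diff), those values enter the score roughly as:

\[
\mathrm{score}(D, Q) = \sum_{t \in Q} \mathrm{IDF}(t)\,\frac{f(t, D)\,(k_1 + 1)}{f(t, D) + k_1\left(1 - b + b\,\frac{|D|}{\mathrm{avgdl}}\right)},
\qquad
\mathrm{IDF}(t) = \ln\!\left(\frac{N - \mathrm{df}(t) + 0.5}{\mathrm{df}(t) + 0.5} + 1\right)
\]

with df(t) taken from documentFrequencies, N from indexedRows(), and avgdl ≈ approximateTotalTermCount() / N. Since IDF decreases as df(t) grows, the small per-shard overcounting noted in the getMaxKeys() comment only deflates a term's weight slightly, which is why the diff treats it as acceptable.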

test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java

Lines changed: 28 additions & 0 deletions
@@ -128,6 +128,34 @@ public void testDeletedRowWithPredicate() throws Throwable
         beforeAndAfterFlush(() -> assertRows(execute(select), row(1)));
     }
 
+    @Test
+    public void testRangeRestrictedBM25OnlyQuery() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v text, n int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(n) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
+        createAnalyzedIndex();
+        execute("INSERT INTO %s (k, v, n) VALUES (1, 'apple', 0)");
+        execute("INSERT INTO %s (k, v, n) VALUES (2, 'apple juice', 0)");
+        String select = "SELECT k FROM %s WHERE token(k) > token(1) AND token(k) < token(3) ORDER BY v BM25 OF 'apple' LIMIT 3";
+        beforeAndAfterFlush(() -> assertRows(execute(select), row(2)));
+    }
+
+    @Test
+    public void testRangeRestrictedHybridQuery() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v text, n int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(n) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
+        createAnalyzedIndex();
+        execute("INSERT INTO %s (k, v, n) VALUES (1, 'apple', 0)");
+        execute("INSERT INTO %s (k, v, n) VALUES (2, 'apple juice', 0)");
+        // Insert many unrelated rows so we do search-then-sort
+        for (int i = 3; i < 100; i++)
+            execute("INSERT INTO %s (k, v, n) VALUES (?, 'apple juice', 1)", i);
+        String select = "SELECT k FROM %s WHERE token(k) > token(1) AND token(k) < token(3) " +
+                        "AND n = 0 ORDER BY v BM25 OF 'apple' LIMIT 3";
+        beforeAndAfterFlush(() -> assertRows(execute(select), row(2)));
+    }
+
     @Test
     public void testTwoIndexesAmbiguousPredicate() throws Throwable
     {
