CNDB-16176: CNDB-15919: Optimize SAI NOT queries, push logic into posting lists (#2112)

michaeljmarshall · michaelsembwever · commit 9dd4c28e631e · 2025-12-15T10:20:43.000+01:00
Fixes: riptano/cndb#15919 Test PR: riptano/cndb#15949 In the original implementation for #820, we introduced the `PrimaryKeyMapIterator` to iterate all primary keys in an sstable and then do an anti-join on the result of an equality query. That design works, but requires some additional reads from disk to get primary keys that are unnecessary. There are two possible solutions: 1. We can use row ids (either sstable or segment) to do the complement of the resulting posting lists. This will be the most performant, since it avoids object allocations. The main issue with this solution is that it is much more complicated to implement and had unaddressed edge cases. 2. We can use the `primaryKeyFromRowId` that takes primary key bounds and then uses a row id, when rows are from the same sstable. This will be worse than solution 1 because it creates an object per key and requires comparing sstable ids before comparing sstable row ids, but it is a significant improvement over the current solution, which hits disk to load the primary key. When testing on my local machine and reviewing the JMH benchmarks, I can see that the current solution is about 16x worse than the minimum solution (2) and 32x worse than the optimal (1) solution. Given that the benchmarks in question are highly specific to the use case, I do not think we have sufficient motivation to introduce the exceedingly complex (1) solution. Note that the ideal solution to 1, that would have much less complexity, is to convert posting lists into a single iterator of sstable row ids, and then to take the complement of them.
diff --git a/src/java/org/apache/cassandra/index/sai/disk/PrimaryKeyMapIterator.java b/src/java/org/apache/cassandra/index/sai/disk/PrimaryKeyMapIterator.java
@@ -110,7 +110,7 @@ protected PrimaryKey computeNext()
     {
         while (currentRowId >= 0 && currentRowId < keys.count())
         {
-            PrimaryKey key = keys.primaryKeyFromRowId(currentRowId++);
+            PrimaryKey key = keys.primaryKeyFromRowId(currentRowId++, getMinimum(), getMaximum());
             if (filter == KeyFilter.KEYS_WITH_CLUSTERING && key.hasEmptyClustering())
                 continue;
             return key;
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/NEQQueryBench.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/NEQQueryBench.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai;
+
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.index.sai.SAITester;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+/** JMH benchmark for an SAI {@code NOT CONTAINS} query in which almost every indexed row fails the predicate. */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 10, time = 10, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 10, time = 10, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+@Threads(1)
+@State(Scope.Benchmark)
+public class NEQQueryBench extends SAITester
+{
+    /**
+     * The more rows, the deeper the NEQ query has to search, based on the implementation at the time of this commit.
+     */
+    @Param({ "1000", "10000" })
+    public int numRowsWithinPartition;
+
+    /** Runs once per trial: initializes CQLTester, creates the table and its SAI index, inserts the rows, then flushes. */
+    @Setup(Level.Trial)
+    public void setup() throws Throwable
+    {
+        CQLTester.setUpClass();
+        CQLTester.prepareServer();
+        beforeTest();
+
+        createTable("CREATE TABLE %s (k int, c int, l list<text>, PRIMARY KEY (k, c))");
+        createIndex("CREATE CUSTOM INDEX ON %s(l) USING 'StorageAttachedIndex'");
+
+        // Most rows contain 'a' and so fail the predicate (l NOT CONTAINS 'a'); each partition's single matching
+        // row gets the highest clustering value, so it comes last in the partition's clustering order.
+        for (int k = 0; k < 100; k++)
+        {
+            for (int c = 0; c < numRowsWithinPartition; c++)
+            {
+                execute("INSERT INTO %s (k, c, l) VALUES (?, ?, ?)", k, c, list("a"));
+            }
+            // Use the next clustering value so the matching row is last for every param (a fixed 1000 would not be).
+            execute("INSERT INTO %s (k, c, l) VALUES (?, ?, ?)", k, numRowsWithinPartition, list("zzz"));
+        }
+        flush();
+    }
+
+    /** Runs once per trial: calls the commit log's blocking shutdown and then {@code CQLTester.cleanup()}. */
+    @TearDown(Level.Trial)
+    public void teardown() throws IOException, ExecutionException, InterruptedException
+    {
+        CommitLog.instance.shutdownBlocking();
+        CQLTester.cleanup();
+    }
+
+    /**
+     * Measures one execution of the {@code NOT CONTAINS} query against the data written by {@link #setup()}.
+     */
+    @Benchmark
+    public Object queryNEQ()
+    {
+        return execute("SELECT * FROM %s WHERE l NOT CONTAINS 'a'");
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,7 @@ protected PrimaryKey computeNext()`
`110`	`110`	`{`
`111`	`111`	`while (currentRowId >= 0 && currentRowId < keys.count())`
`112`	`112`	`{`
`113`		`- PrimaryKey key = keys.primaryKeyFromRowId(currentRowId++);`
	`113`	`+ PrimaryKey key = keys.primaryKeyFromRowId(currentRowId++, getMinimum(), getMaximum());`
`114`	`114`	`if (filter == KeyFilter.KEYS_WITH_CLUSTERING && key.hasEmptyClustering())`
`115`	`115`	`continue;`
`116`	`116`	`return key;`