Skip to content

Commit 9dd4c28

Browse files
michaeljmarshall authored and michaelsembwever committed
CNDB-16176: CNDB-15919: Optimize SAI NOT queries, push logic into posting lists (#2112)
Fixes: riptano/cndb#15919 Test PR: riptano/cndb#15949 In the original implementation for #820, we introduced the `PrimaryKeyMapIterator` to iterate all primary keys in an sstable and then do an anti-join on the result of an equality query. That design works, but requires some additional reads from disk to get primary keys that are unnecessary. There are two possible solutions: 1. We can use row ids (either sstable or segment) to do the complement of the resulting posting lists. This will be the most performant, since it avoids object allocations. The main issue with this solution is that it is much more complicated to implement and had unaddressed edge cases. 2. We can use the `primaryKeyFromRowId` that takes primary key bounds and then uses a row id, when rows are from the same sstable. This will be worse than solution 1 because it creates an object per key and requires comparing sstable ids before comparing sstable row ids, but it is a significant improvement over the current solution, which hits disk to load the primary key. When testing on my local machine and reviewing the JMH benchmarks, I can see that the current solution is about 16x worse than the minimum solution (2) and 32x worse than the optimal (1) solution. Given that the benchmarks in question are highly specific to the use case, I do not think we have sufficient motivation to introduce the exceedingly complex (1) solution. Note that the ideal solution to 1, that would have much less complexity, is to convert posting lists into a single iterator of sstable row ids, and then to take the complement of them.
1 parent e054ecf commit 9dd4c28

File tree

2 files changed

+99
-1
lines changed

2 files changed

+99
-1
lines changed

src/java/org/apache/cassandra/index/sai/disk/PrimaryKeyMapIterator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ protected PrimaryKey computeNext()
110110
{
111111
while (currentRowId >= 0 && currentRowId < keys.count())
112112
{
113-
PrimaryKey key = keys.primaryKeyFromRowId(currentRowId++);
113+
PrimaryKey key = keys.primaryKeyFromRowId(currentRowId++, getMinimum(), getMaximum());
114114
if (filter == KeyFilter.KEYS_WITH_CLUSTERING && key.hasEmptyClustering())
115115
continue;
116116
return key;
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/*
2+
* Copyright DataStax, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.apache.cassandra.test.microbench.index.sai;
18+
19+
import java.io.IOException;
20+
import java.util.concurrent.ExecutionException;
21+
import java.util.concurrent.TimeUnit;
22+
23+
import org.apache.cassandra.cql3.CQLTester;
24+
import org.apache.cassandra.db.commitlog.CommitLog;
25+
import org.apache.cassandra.index.sai.SAITester;
26+
import org.openjdk.jmh.annotations.Benchmark;
27+
import org.openjdk.jmh.annotations.BenchmarkMode;
28+
import org.openjdk.jmh.annotations.Fork;
29+
import org.openjdk.jmh.annotations.Level;
30+
import org.openjdk.jmh.annotations.Measurement;
31+
import org.openjdk.jmh.annotations.Mode;
32+
import org.openjdk.jmh.annotations.OutputTimeUnit;
33+
import org.openjdk.jmh.annotations.Param;
34+
import org.openjdk.jmh.annotations.Scope;
35+
import org.openjdk.jmh.annotations.Setup;
36+
import org.openjdk.jmh.annotations.State;
37+
import org.openjdk.jmh.annotations.TearDown;
38+
import org.openjdk.jmh.annotations.Threads;
39+
import org.openjdk.jmh.annotations.Warmup;
40+
41+
@BenchmarkMode(Mode.AverageTime)
42+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
43+
@Warmup(iterations = 10, time = 10, timeUnit = TimeUnit.SECONDS)
44+
@Measurement(iterations = 10, time = 10, timeUnit = TimeUnit.SECONDS)
45+
@Fork(value = 1)
46+
@Threads(1)
47+
@State(Scope.Benchmark)
48+
public class NEQQueryBench extends SAITester
49+
{
50+
51+
/**
52+
* The more rows, the deeper the NEQ query has to search, based on the implementation at the time of this commit.
53+
*/
54+
@Param({ "1000", "10000" })
55+
public int numRowsWithinPartition;
56+
57+
@Setup(Level.Trial)
58+
public void setup() throws Throwable
59+
{
60+
CQLTester.setUpClass();
61+
CQLTester.prepareServer();
62+
beforeTest();
63+
64+
// create the schema
65+
createTable("CREATE TABLE %s (k int, c int, l list<text>, PRIMARY KEY (k, c))");
66+
createIndex("CREATE CUSTOM INDEX ON %s(l) USING 'StorageAttachedIndex'");
67+
68+
// Insert the data so that there are many keys in the index, most of them will not match the query
69+
// predicate (l NOT CONTAINS 'a'), and the ones that do contain it are at the end of the posting list
70+
// because of the clustering order.
71+
for (int k = 0; k < 100; k++)
72+
{
73+
for (int c = 0; c < numRowsWithinPartition; c++)
74+
{
75+
execute("INSERT INTO %s (k, c, l) VALUES (?, ?, ?)", k, c, list("a"));
76+
}
77+
// Now add one at the end of the partition's clustering order that will satisfy the query
78+
execute("INSERT INTO %s (k, c, l) VALUES (?, ?, ?)", k, 1000, list("zzz"));
79+
}
80+
flush();
81+
}
82+
83+
@TearDown(Level.Trial)
84+
public void teardown() throws IOException, ExecutionException, InterruptedException
85+
{
86+
CommitLog.instance.shutdownBlocking();
87+
CQLTester.cleanup();
88+
}
89+
90+
/**
91+
* Test the cost of creating the execution info object.
92+
*/
93+
@Benchmark
94+
public Object queryNEQ()
95+
{
96+
return execute("SELECT * FROM %s WHERE l NOT CONTAINS 'a'");
97+
}
98+
}

0 commit comments

Comments
 (0)