Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
8a634e2
CNDB-15554: Bump jvector version
michaeljmarshall Oct 2, 2025
303b56c
Work in progress for fused adc, fails when querying compacted graph
michaeljmarshall Oct 3, 2025
05b2148
Skip writing fused graph if version too early
michaeljmarshall Oct 6, 2025
903b76b
Comment out writeHeader to fix build
michaeljmarshall Oct 8, 2025
e63f53d
Merge branch 'cndb-15554' into revive-fused-adc
michaeljmarshall Oct 8, 2025
3afcb24
Add writeHeader back in to fix failing compaction test
michaeljmarshall Oct 9, 2025
6859092
CNDB-15640: Determine if vectors are unit length at insert
michaeljmarshall Oct 9, 2025
7c3c804
Merge remote-tracking branch 'datastax/main' into revive-fused-adc
michaeljmarshall Oct 10, 2025
db9cfab
Remove unused variable
michaeljmarshall Oct 10, 2025
2ea5650
Merge branch 'cndb-15640' into revive-fused-adc
michaeljmarshall Oct 10, 2025
51c3326
Save progress on COHG (doesn't work yet)
michaeljmarshall Oct 15, 2025
2dd5a4f
Save progress (works now, but need to consider refactoring)
michaeljmarshall Oct 15, 2025
e21aade
Fix writing header before closing CompactionGraph's first ODGIW
michaeljmarshall Oct 15, 2025
342aeb4
Merge remote-tracking branch 'datastax/main' into revive-fused-adc
michaeljmarshall Nov 3, 2025
bd73ca0
Update for latest code changes
michaeljmarshall Nov 11, 2025
97643a8
Merge remote-tracking branch 'datastax/main' into revive-fused-adc
michaeljmarshall Nov 18, 2025
7a37a55
Use internal build of jvector
michaeljmarshall Nov 25, 2025
5281fdd
Merge remote-tracking branch 'datastax/main' into revive-fused-adc
michaeljmarshall Dec 1, 2025
569d0ae
Bring in jvector bug fix
michaeljmarshall Dec 4, 2025
3410e36
Add temp hack to parallelize pq computation
michaeljmarshall Dec 4, 2025
1d9129c
CNDB-16243: Improve ChunkCache Key in order to keep O(1) access time
eolivelli Dec 7, 2025
29b6121
CNDB-16252: temporary hack to allow building multiple vector indexes …
eolivelli Dec 9, 2025
213a2b9
CNDB-16308: Add GLOBAL_HOLES_ALLOWED logic to CompactionGraph
michaeljmarshall Dec 15, 2025
1f4e76f
Fix indentation
michaeljmarshall Dec 15, 2025
9e638f8
Add cassandra.sai.vector.encode_write_graph_parallel, default to true
michaeljmarshall Dec 15, 2025
23d7f3f
Pass local path to OnDiskGraphIndexWriter for parallel writes
michaeljmarshall Dec 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@
<dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0" />
<dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0" />
<dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0" />
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-rc.5" />
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-rc.7-352eda26" />
<dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>

<dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">
Expand Down
27 changes: 21 additions & 6 deletions src/java/org/apache/cassandra/cache/ChunkCache.java
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ public void invalidateFileNow(File file)
synchronousCache.invalidateAll(Iterables.filter(cache.asMap().keySet(), x -> (x.readerId & mask) == fileId));
}

static class Key
static class Key implements Comparable<Key>
{
final long readerId;
final long position;
Expand All @@ -312,11 +312,15 @@ private Key(long readerId, long position)
@Override
public int hashCode()
{
final int prime = 31;
int result = 1;
result = prime * result + Long.hashCode(readerId);
result = prime * result + Long.hashCode(position);
return result;
// Mix readerId and position into a single long by multiplying their sum with a large odd constant.
// The constant is 2^64 divided by the Golden Ratio (the usual Fibonacci-hashing multiplier); it is not a prime.
long mixed = (readerId + position) * 0x9E3779B97F4A7C15L;

// Spread the bits (XOR-shift) to ensure high bits affect low bits
mixed ^= (mixed >>> 32);
mixed ^= (mixed >>> 16);

return (int) mixed;
}

@Override
Expand All @@ -331,6 +335,17 @@ public boolean equals(Object obj)
return (position == other.position)
&& readerId == other.readerId;
}

/**
 * Orders keys by {@code readerId}, using {@code position} only to break ties
 * between keys that share the same {@code readerId}.
 *
 * @param other the key to compare against
 * @return a negative value, zero, or a positive value when this key sorts before,
 *         equal to, or after {@code other}
 */
@Override
public int compareTo(Key other)
{
    int byReader = Long.compare(readerId, other.readerId);
    return byReader != 0 ? byReader : Long.compare(position, other.position);
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -436,12 +436,16 @@ public enum CassandraRelevantProperties
SAI_VECTOR_FLUSH_THRESHOLD_MAX_ROWS("cassandra.sai.vector_flush_threshold_max_rows", "-1"),
// Use non-positive value to disable it. Period in millis to trigger a flush for SAI vector memtable index.
SAI_VECTOR_FLUSH_PERIOD_IN_MILLIS("cassandra.sai.vector_flush_period_in_millis", "-1"),
// Whether compaction should build vector indexes using fused adc
SAI_VECTOR_ENABLE_FUSED("cassandra.sai.vector.enable_fused", "true"),
// Use nvq when building graphs in compaction. Disabled by default for now. Enabling will reduce recall slightly
// while also reducing the storage footprint.
SAI_VECTOR_ENABLE_NVQ("cassandra.sai.vector.enable_nvq", "false"),
// NVQ number of subvectors. This isn't really expected to change much so we're only exposing
// it as a global variable in case it's needed.
SAI_VECTOR_NVQ_NUM_SUB_VECTORS("cassandra.sai.vector.nvq_num_sub_vectors", "2"),
// When building a compaction graph, encode layer 0 nodes in parallel and subsequently use async io for writes.
SAI_ENCODE_AND_WRITE_VECTOR_GRAPH_IN_PARALLEL("cassandra.sai.vector.encode_write_graph_parallel", "true"),
/**
* Whether to disable auto-compaction
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.apache.cassandra.index.sai.disk.v5.V5OnDiskFormat;
import org.apache.cassandra.index.sai.disk.v6.V6OnDiskFormat;
import org.apache.cassandra.index.sai.disk.v7.V7OnDiskFormat;
import org.apache.cassandra.index.sai.disk.v8.V8OnDiskFormat;
import org.apache.cassandra.index.sai.utils.TypeUtil;
import org.apache.cassandra.io.sstable.format.SSTableFormat;
import org.apache.cassandra.schema.SchemaConstants;
Expand Down Expand Up @@ -75,10 +76,12 @@ public class Version implements Comparable<Version>
public static final Version EC = new Version("ec", V7OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "ec"));
// total terms count serialization in index metadata, enables ANN_USE_SYNTHETIC_SCORE by default
public static final Version ED = new Version("ed", V7OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "ed"));
// jvector file format version 6 (skipped 5)
public static final Version FA = new Version("fa", V8OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "fa"));

// These are in reverse-chronological order so that the latest version is first. Version matching tests
// are more likely to match the latest version, so we want to test that one first.
public static final List<Version> ALL = Lists.newArrayList(ED, EC, EB, DC, DB, CA, BA, AA);
public static final List<Version> ALL = Lists.newArrayList(FA, ED, EC, EB, DC, DB, CA, BA, AA);

public static final Version EARLIEST = AA;
public static final Version VECTOR_EARLIEST = BA;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.disk.v8;

import org.apache.cassandra.index.sai.disk.v7.V7OnDiskFormat;

/**
 * On-disk format that extends {@link V7OnDiskFormat} and overrides only the
 * jvector file format version it reports.
 */
public class V8OnDiskFormat extends V7OnDiskFormat
{
    /** jvector file format version reported by this on-disk format. */
    private static final int JVECTOR_FILE_FORMAT_VERSION = 6;

    /** Shared instance of this format. */
    public static final V8OnDiskFormat instance = new V8OnDiskFormat();

    /**
     * @return the jvector file format version used by this on-disk format
     */
    @Override
    public int jvectorFileFormatVersion()
    {
        return JVECTOR_FILE_FORMAT_VERSION;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ public CassandraDiskAnn(SSTableContext sstableContext, SegmentMetadata segmentMe

SegmentMetadata.ComponentMetadata termsMetadata = this.componentMetadatas.get(IndexComponentType.TERMS_DATA);
graphHandle = indexFiles.termsData();
var rawGraph = OnDiskGraphIndex.load(graphHandle::createReader, termsMetadata.offset);
var rawGraph = OnDiskGraphIndex.load(graphHandle::createReader, termsMetadata.offset, false);
features = rawGraph.getFeatureSet();
graph = rawGraph;
usesNVQ = features.contains(FeatureId.NVQ_VECTORS);
Expand Down Expand Up @@ -123,7 +123,7 @@ public CassandraDiskAnn(SSTableContext sstableContext, SegmentMetadata segmentMe
}

VectorCompression.CompressionType compressionType = VectorCompression.CompressionType.values()[reader.readByte()];
if (features.contains(FeatureId.FUSED_ADC))
if (features.contains(FeatureId.FUSED_PQ))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@marianotepper - I just noticed that the features map already has logic that loads the ProductQuantization, meaning this branch currently keeps two identical maps in memory. I think it'd make sense to possibly expose the features map in the OnDiskGraphIndex so we can remove the duplicate cost. Any reason we can't do that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like we actually already use the one from the header, I just didn't catch it. I think we'll be able to get rid of it with a little extra work in CC.

{
assert compressionType == VectorCompression.CompressionType.PRODUCT_QUANTIZATION;
compressedVectors = null;
Expand Down Expand Up @@ -239,9 +239,7 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
{
var view = (ImmutableGraphIndex.ScoringView) searcher.getView();
SearchScoreProvider ssp;
// FusedADC can no longer be written due to jvector upgrade. However, it's possible these index files
// still exist, so we have to support them.
if (features.contains(FeatureId.FUSED_ADC))
if (features.contains(FeatureId.FUSED_PQ))
{
var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);
var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
Expand Down
Loading