From 0b76ac3e844fa92c6480bb15e6440bed2bfb69da Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Wed, 2 Oct 2024 14:22:14 -0400 Subject: [PATCH 01/25] refactor float vector values random access --- .../synonym/word2vec/Word2VecModel.java | 16 ++-- .../lucene90/Lucene90HnswGraphBuilder.java | 26 +++---- .../lucene90/Lucene90HnswVectorsReader.java | 32 ++++---- .../lucene90/Lucene90OnHeapHnswGraph.java | 5 +- .../lucene91/Lucene91HnswVectorsReader.java | 31 ++++---- .../lucene92/OffHeapFloatVectorValues.java | 68 ++++++++--------- .../lucene94/OffHeapFloatVectorValues.java | 68 ++++++++--------- .../lucene90/Lucene90HnswVectorsWriter.java | 6 +- .../lucene91/Lucene91HnswGraphBuilder.java | 32 ++++---- .../lucene91/Lucene91HnswVectorsWriter.java | 6 +- .../lucene92/Lucene92HnswVectorsWriter.java | 4 +- .../lucene94/Lucene94HnswVectorsWriter.java | 4 +- .../lucene95/Lucene95HnswVectorsWriter.java | 4 +- .../TestBasicBackwardsCompatibility.java | 6 +- .../SimpleTextKnnVectorsReader.java | 22 +++--- .../SimpleTextKnnVectorsWriter.java | 10 ++- .../codecs/BufferingKnnVectorsWriter.java | 31 ++++---- .../lucene/codecs/KnnVectorsWriter.java | 38 ++++++---- .../codecs/hnsw/DefaultFlatVectorScorer.java | 19 ++--- .../lucene95/OffHeapFloatVectorValues.java | 70 +++++++---------- .../lucene99/Lucene99FlatVectorsWriter.java | 3 +- .../Lucene99ScalarQuantizedVectorsReader.java | 15 ++-- .../Lucene99ScalarQuantizedVectorsWriter.java | 54 ++++++------- .../apache/lucene/index/ByteVectorValues.java | 1 - .../org/apache/lucene/index/CheckIndex.java | 5 +- .../lucene/index/ExitableDirectoryReader.java | 14 ++-- .../lucene/index/FloatVectorValues.java | 34 +++++---- .../apache/lucene/index/KnnVectorValues.java | 6 -- .../SlowCompositeCodecReaderWrapper.java | 38 +++++----- .../lucene/index/SortingCodecReader.java | 16 ++-- .../util/quantization/ScalarQuantizer.java | 20 ++--- .../org/apache/lucene/document/TestField.java | 6 +- .../index/TestExitableDirectoryReader.java | 2 +- 
.../org/apache/lucene/index/TestKnnGraph.java | 3 +- .../lucene/index/TestSortingCodecReader.java | 2 +- .../lucene/util/hnsw/HnswGraphTestCase.java | 75 ++++++++++++------- .../lucene/util/hnsw/MockVectorValues.java | 34 ++++----- .../util/hnsw/TestHnswFloatVectorGraph.java | 3 +- .../TestScalarQuantizedVectorSimilarity.java | 13 +++- .../quantization/TestScalarQuantizer.java | 14 ++-- .../lucene/index/memory/MemoryIndex.java | 27 ++++--- .../lucene/index/memory/TestMemoryIndex.java | 2 +- .../FloatKnnVectorFieldSource.java | 3 +- .../sandbox/codecs/quantization/KMeans.java | 23 +++--- .../codecs/quantization/SampleReader.java | 15 ++-- .../index/BaseKnnVectorsFormatTestCase.java | 55 ++++++++------ 46 files changed, 498 insertions(+), 483 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java index 68fd3b5884b..4b264c97582 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.synonym.word2vec; -import java.io.IOException; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; @@ -62,8 +61,13 @@ public void addTermAndVector(TermAndVector modelEntry) { } @Override - public float[] vectorValue(int targetOrd) { - return termsAndVectors[targetOrd].vector(); + public Floats values() { + return new Floats() { + @Override + public float[] get(int targetOrd) { + return termsAndVectors[targetOrd].vector(); + } + }; } public float[] vectorValue(BytesRef term) { @@ -86,10 +90,4 @@ public int dimension() { public int size() { return dictionarySize; } - - @Override - public Word2VecModel copy() throws IOException { - return new 
Word2VecModel( - this.dictionarySize, this.vectorDimension, this.termsAndVectors, this.word2Vec); - } } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index 0d7fd520a30..eb9ac7abedd 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -50,6 +50,7 @@ public final class Lucene90HnswGraphBuilder { private final VectorSimilarityFunction similarityFunction; private final FloatVectorValues vectorValues; + private final FloatVectorValues.Floats vectors; private final SplittableRandom random; private final Lucene90BoundsChecker bound; final Lucene90OnHeapHnswGraph hnsw; @@ -58,7 +59,7 @@ public final class Lucene90HnswGraphBuilder { // we need two sources of vectors in order to perform diversity check comparisons without // colliding - private final FloatVectorValues buildVectors; + private final FloatVectorValues.Floats buildVectors; /** * Reads all the vectors from vector values, builds a graph connecting them by their dense @@ -79,8 +80,9 @@ public Lucene90HnswGraphBuilder( int beamWidth, long seed) throws IOException { - vectorValues = vectors.copy(); - buildVectors = vectors.copy(); + this.vectorValues = vectors; + this.vectors = vectors.values(); + buildVectors = vectors.values(); this.similarityFunction = Objects.requireNonNull(similarityFunction); if (maxConn <= 0) { throw new IllegalArgumentException("maxConn must be positive"); @@ -105,17 +107,14 @@ public Lucene90HnswGraphBuilder( * accessor for the vectors */ public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { - if (vectors == vectorValues) { - throw new IllegalArgumentException( - "Vectors to build must be independent of the 
source of vectors provided to HnswGraphBuilder()"); - } if (infoStream.isEnabled(HNSW_COMPONENT)) { infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors"); } long start = System.nanoTime(), t = start; // start at node 1! node 0 is added implicitly, in the constructor + FloatVectorValues.Floats values = vectors.values(); for (int node = 1; node < vectors.size(); node++) { - addGraphNode(vectors.vectorValue(node)); + addGraphNode(values.get(node)); if (node % 10000 == 0) { if (infoStream.isEnabled(HNSW_COMPONENT)) { long now = System.nanoTime(); @@ -200,7 +199,7 @@ private void selectDiverse(Lucene90NeighborArray neighbors, Lucene90NeighborArra int cNode = candidates.node()[i]; float cScore = candidates.score()[i]; assert cNode < hnsw.size(); - if (diversityCheck(vectorValues.vectorValue(cNode), cScore, neighbors, buildVectors)) { + if (diversityCheck(vectors.get(cNode), cScore, neighbors, buildVectors)) { neighbors.add(cNode, cScore); } } @@ -230,12 +229,12 @@ private boolean diversityCheck( float[] candidate, float score, Lucene90NeighborArray neighbors, - FloatVectorValues vectorValues) + FloatVectorValues.Floats vectorValues) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { float neighborSimilarity = - similarityFunction.compare(candidate, vectorValues.vectorValue(neighbors.node()[i])); + similarityFunction.compare(candidate, vectorValues.get(neighbors.node()[i])); if (bound.check(neighborSimilarity) == false) { return false; } @@ -269,11 +268,10 @@ private int findNonDiverse(Lucene90NeighborArray neighbors) throws IOException { // them, drop it int neighborId = neighbors.node()[i]; bound.set(neighbors.score()[i]); - float[] neighborVector = vectorValues.vectorValue(neighborId); + float[] neighborVector = vectors.get(neighborId); for (int j = maxConn; j > i; j--) { float neighborSimilarity = - similarityFunction.compare( - neighborVector, buildVectors.vectorValue(neighbors.node()[j])); + 
similarityFunction.compare(neighborVector, buildVectors.get(neighbors.node()[j])); if (bound.check(neighborSimilarity) == false) { // node j is too similar to node i given its score relative to the base node // replace it with the new node, which is at [maxConn] diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index 1196ed3fdb6..2e860777ba7 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -388,19 +388,19 @@ public int size() { } @Override - public OffHeapFloatVectorValues copy() { - return new OffHeapFloatVectorValues(dimension, ordToDoc, similarityFunction, dataIn.clone()); - } - - @Override - public float[] vectorValue(int targetOrd) throws IOException { - if (lastOrd == targetOrd) { - return value; - } - dataIn.seek((long) targetOrd * byteSize); - dataIn.readFloats(value, 0, value.length); - lastOrd = targetOrd; - return value; + public Floats values() { + return new Floats() { + @Override + public float[] get(int targetOrd) throws IOException { + if (lastOrd == targetOrd) { + return value; + } + dataIn.seek((long) targetOrd * byteSize); + dataIn.readFloats(value, 0, value.length); + lastOrd = targetOrd; + return value; + } + }; } @Override @@ -418,12 +418,12 @@ public VectorScorer scorer(float[] target) { if (size() == 0) { return null; } - OffHeapFloatVectorValues values = this.copy(); - DocIndexIterator iterator = values.iterator(); + FloatVectorValues.Floats values = values(); + DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.similarityFunction.compare(values.vectorValue(iterator.index()), target); + return 
similarityFunction.compare(values.get(iterator.index()), target); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java index 845987c2957..6faf2f06cd8 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java @@ -83,6 +83,7 @@ public static NeighborQueue search( throws IOException { int size = graphValues.size(); + FloatVectorValues.Floats values = vectors.values(); // MIN heap, holding the top results NeighborQueue results = new NeighborQueue(numSeed, false); // MAX heap, from which to pull the candidate nodes @@ -101,7 +102,7 @@ public static NeighborQueue search( break; } // explore the topK starting points of some random numSeed probes - float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint)); + float score = similarityFunction.compare(query, values.get(entryPoint)); candidates.add(entryPoint, score); if (acceptOrds == null || acceptOrds.get(entryPoint)) { results.add(entryPoint, score); @@ -137,7 +138,7 @@ public static NeighborQueue search( break; } - float friendSimilarity = similarityFunction.compare(query, vectors.vectorValue(friendOrd)); + float friendSimilarity = similarityFunction.compare(query, values.get(friendOrd)); if (results.size() < numSeed || bound.check(friendSimilarity) == false) { candidates.add(friendOrd, friendSimilarity); if (acceptOrds == null || acceptOrds.get(friendOrd)) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java index a140b4fd7f3..24997307197 100644 --- 
a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java @@ -405,7 +405,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues { private final IntUnaryOperator ordToDocOperator; private final IndexInput dataIn; private final int byteSize; - private final float[] value; private final VectorSimilarityFunction similarityFunction; OffHeapFloatVectorValues( @@ -421,7 +420,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues { this.dataIn = dataIn; this.similarityFunction = similarityFunction; byteSize = Float.BYTES * dimension; - value = new float[dimension]; } @Override @@ -435,16 +433,17 @@ public int size() { } @Override - public OffHeapFloatVectorValues copy() { - return new OffHeapFloatVectorValues( - dimension, size, ordToDoc, similarityFunction, dataIn.clone()); - } - - @Override - public float[] vectorValue(int targetOrd) throws IOException { - dataIn.seek((long) targetOrd * byteSize); - dataIn.readFloats(value, 0, value.length); - return value; + public Floats values() throws IOException { + IndexInput input = dataIn.clone(); + float[] value = new float[dimension]; + return new Floats() { + @Override + public float[] get(int targetOrd) throws IOException { + input.seek((long) targetOrd * byteSize); + input.readFloats(value, 0, value.length); + return value; + } + }; } @Override @@ -458,16 +457,16 @@ public DocIndexIterator iterator() { } @Override - public VectorScorer scorer(float[] target) { + public VectorScorer scorer(float[] target) throws IOException { if (size == 0) { return null; } - OffHeapFloatVectorValues values = this.copy(); - DocIndexIterator iterator = values.iterator(); + Floats values = values(); + DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return 
values.similarityFunction.compare(values.vectorValue(iterator.index()), target); + return similarityFunction.compare(values.get(iterator.index()), target); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 7c87bac5e54..726b0187519 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -35,10 +35,7 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues { protected final int size; protected final IndexInput slice; protected final int byteSize; - protected int lastOrd = -1; - protected final float[] value; protected final VectorSimilarityFunction vectorSimilarityFunction; - ; OffHeapFloatVectorValues( int dimension, @@ -49,7 +46,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues { this.size = size; this.slice = slice; byteSize = Float.BYTES * dimension; - value = new float[dimension]; this.vectorSimilarityFunction = vectorSimilarityFunction; } @@ -64,14 +60,23 @@ public int size() { } @Override - public float[] vectorValue(int targetOrd) throws IOException { - if (lastOrd == targetOrd) { - return value; - } - slice.seek((long) targetOrd * byteSize); - slice.readFloats(value, 0, value.length); - lastOrd = targetOrd; - return value; + public Floats values() throws IOException { + return new Floats() { + final IndexInput dictionarySlice = slice.clone(); + int lastOrd = -1; + float[] value = new float[dimension]; + + @Override + public float[] get(int targetOrd) throws IOException { + if (lastOrd == targetOrd) { + return value; + } + dictionarySlice.seek((long) targetOrd * byteSize); + dictionarySlice.readFloats(value, 0, value.length); + lastOrd = targetOrd; + return value; 
+ } + }; } static OffHeapFloatVectorValues load( @@ -101,11 +106,6 @@ public DenseOffHeapVectorValues( super(dimension, size, vectorSimilarityFunction, slice); } - @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues(dimension, size, vectorSimilarityFunction, slice.clone()); - } - @Override public DocIndexIterator iterator() { return createDenseIterator(); @@ -118,13 +118,12 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { - DenseOffHeapVectorValues values = this.copy(); - DocIndexIterator iterator = values.iterator(); + FloatVectorValues.Floats values = values(); + DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare( - values.vectorValue(iterator.index()), query); + return vectorSimilarityFunction.compare(values.get(iterator.index()), query); } @Override @@ -165,12 +164,6 @@ public SparseOffHeapVectorValues( fieldEntry.size()); } - @Override - public SparseOffHeapVectorValues copy() throws IOException { - return new SparseOffHeapVectorValues( - fieldEntry, dataIn, vectorSimilarityFunction, slice.clone()); - } - @Override public DocIndexIterator iterator() { return IndexedDISI.asDocIndexIterator(disi); @@ -201,13 +194,12 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { - SparseOffHeapVectorValues values = this.copy(); - DocIndexIterator iterator = values.iterator(); + FloatVectorValues.Floats values = values(); + DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare( - values.vectorValue(iterator.index()), query); + return vectorSimilarityFunction.compare(values.get(iterator.index()), query); } @Override @@ -235,13 +227,13 @@ public int size() { } @Override - 
public OffHeapFloatVectorValues copy() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public float[] vectorValue(int targetOrd) throws IOException { - throw new UnsupportedOperationException(); + public Floats values() { + return new Floats() { + @Override + public float[] get(int targetOrd) throws IOException { + throw new UnsupportedOperationException(); + } + }; } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index b21df901ddb..04dbabcc653 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -35,8 +35,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues { protected final int size; protected final IndexInput slice; protected final int byteSize; - protected int lastOrd = -1; - protected final float[] value; protected final VectorSimilarityFunction vectorSimilarityFunction; OffHeapFloatVectorValues( @@ -49,7 +47,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues { this.size = size; this.slice = slice; this.byteSize = byteSize; - value = new float[dimension]; this.vectorSimilarityFunction = vectorSimilarityFunction; } @@ -64,14 +61,23 @@ public int size() { } @Override - public float[] vectorValue(int targetOrd) throws IOException { - if (lastOrd == targetOrd) { - return value; - } - slice.seek((long) targetOrd * byteSize); - slice.readFloats(value, 0, value.length); - lastOrd = targetOrd; - return value; + public Floats values() { + return new Floats() { + final IndexInput dictionarySlice = slice.clone(); + int lastOrd = -1; + float[] value = new float[dimension]; + + @Override + public float[] get(int targetOrd) throws 
IOException { + if (lastOrd == targetOrd) { + return value; + } + dictionarySlice.seek((long) targetOrd * byteSize); + dictionarySlice.readFloats(value, 0, value.length); + lastOrd = targetOrd; + return value; + } + }; } static OffHeapFloatVectorValues load( @@ -111,12 +117,6 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, vectorSimilarityFunction, byteSize); } - @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues( - dimension, size, slice.clone(), vectorSimilarityFunction, byteSize); - } - @Override public DocIndexIterator iterator() { return createDenseIterator(); @@ -129,14 +129,13 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { - DenseOffHeapVectorValues values = this.copy(); - DocIndexIterator iterator = values.iterator(); + Floats floats = values(); + DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare( - values.vectorValue(iterator.index()), query); + return vectorSimilarityFunction.compare(floats.get(iterator.index()), query); } @Override @@ -178,12 +177,6 @@ public SparseOffHeapVectorValues( fieldEntry.size()); } - @Override - public SparseOffHeapVectorValues copy() throws IOException { - return new SparseOffHeapVectorValues( - fieldEntry, dataIn, slice.clone(), vectorSimilarityFunction, byteSize); - } - @Override public DocIndexIterator iterator() { return IndexedDISI.asDocIndexIterator(disi); @@ -214,13 +207,12 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { - SparseOffHeapVectorValues values = this.copy(); - DocIndexIterator iterator = values.iterator(); + DocIndexIterator iterator = iterator(); + Floats values = values(); return new VectorScorer() { @Override public float score() throws IOException { - return 
values.vectorSimilarityFunction.compare( - values.vectorValue(iterator.index()), query); + return vectorSimilarityFunction.compare(values.get(iterator.index()), query); } @Override @@ -248,13 +240,13 @@ public int size() { } @Override - public OffHeapFloatVectorValues copy() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public float[] vectorValue(int targetOrd) throws IOException { - throw new UnsupportedOperationException(); + public Floats values() { + return new Floats() { + @Override + public float[] get(int targetOrd) { + throw new UnsupportedOperationException(); + } + }; } @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java index f60411752d2..b4d00e6567e 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java @@ -189,10 +189,10 @@ private static int[] writeVectorData(IndexOutput output, FloatVectorValues vecto ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = vectors.iterator(); + FloatVectorValues.Floats values = vectors.values(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(iter.index()); - binaryVector.asFloatBuffer().put(vectorValue); + binaryVector.asFloatBuffer().put(values.get(iter.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docIds[count++] = docV; } @@ -250,7 +250,7 @@ private void writeGraph( beamWidth, Lucene90HnswGraphBuilder.randSeed); hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream); - Lucene90OnHeapHnswGraph graph = 
hnswGraphBuilder.build(vectorValues.copy()); + Lucene90OnHeapHnswGraph graph = hnswGraphBuilder.build(vectorValues); for (int ord = 0; ord < offsets.length; ord++) { // write graph diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java index 5ef85a8419c..c3ef3a8bb4a 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java @@ -57,7 +57,8 @@ public final class Lucene91HnswGraphBuilder { private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); private final VectorSimilarityFunction similarityFunction; - private final FloatVectorValues vectorValues; + private final FloatVectorValues vectors; + private final FloatVectorValues.Floats vectorValues; private final SplittableRandom random; private final Lucene91BoundsChecker bound; private final HnswGraphSearcher graphSearcher; @@ -68,7 +69,7 @@ public final class Lucene91HnswGraphBuilder { // we need two sources of vectors in order to perform diversity check comparisons without // colliding - private FloatVectorValues buildVectors; + private FloatVectorValues.Floats buildVectors; /** * Reads all the vectors from vector values, builds a graph connecting them by their dense @@ -89,8 +90,9 @@ public Lucene91HnswGraphBuilder( int beamWidth, long seed) throws IOException { - vectorValues = vectors.copy(); - buildVectors = vectors.copy(); + this.vectors = vectors; + vectorValues = vectors.values(); + buildVectors = vectors.values(); this.similarityFunction = Objects.requireNonNull(similarityFunction); if (maxConn <= 0) { throw new IllegalArgumentException("maxConn must be positive"); @@ -106,8 +108,7 @@ public Lucene91HnswGraphBuilder( int levelOfFirstNode 
= getRandomGraphLevel(ml, random); this.hnsw = new Lucene91OnHeapHnswGraph(maxConn, levelOfFirstNode); this.graphSearcher = - new HnswGraphSearcher( - new NeighborQueue(beamWidth, true), new FixedBitSet(vectorValues.size())); + new HnswGraphSearcher(new NeighborQueue(beamWidth, true), new FixedBitSet(vectors.size())); bound = Lucene91BoundsChecker.create(false); scratch = new Lucene91NeighborArray(Math.max(beamWidth, maxConn + 1)); } @@ -121,17 +122,14 @@ public Lucene91HnswGraphBuilder( * accessor for the vectors */ public Lucene91OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { - if (vectors == vectorValues) { - throw new IllegalArgumentException( - "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()"); - } + FloatVectorValues.Floats values = vectors.values(); if (infoStream.isEnabled(HNSW_COMPONENT)) { infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors"); } long start = System.nanoTime(), t = start; // start at node 1! 
node 0 is added implicitly, in the constructor for (int node = 1; node < vectors.size(); node++) { - addGraphNode(node, vectors.vectorValue(node)); + addGraphNode(node, values.get(node)); if ((node % 10000 == 0) && infoStream.isEnabled(HNSW_COMPONENT)) { t = printGraphBuildStatus(node, start, t); } @@ -147,7 +145,7 @@ public void setInfoStream(InfoStream infoStream) { /** Inserts a doc with vector value to the graph */ void addGraphNode(int node, float[] value) throws IOException { RandomVectorScorer scorer = - defaultFlatVectorScorer.getRandomVectorScorer(similarityFunction, vectorValues, value); + defaultFlatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, value); HnswGraphBuilder.GraphBuilderKnnCollector candidates; final int nodeLevel = getRandomGraphLevel(ml, random); int curMaxLevel = hnsw.numLevels() - 1; @@ -224,7 +222,7 @@ private void selectDiverse(Lucene91NeighborArray neighbors, Lucene91NeighborArra int cNode = candidates.node[i]; float cScore = candidates.score[i]; assert cNode < hnsw.size(); - if (diversityCheck(vectorValues.vectorValue(cNode), cScore, neighbors, buildVectors)) { + if (diversityCheck(vectorValues.get(cNode), cScore, neighbors, buildVectors)) { neighbors.add(cNode, cScore); } } @@ -254,12 +252,12 @@ private boolean diversityCheck( float[] candidate, float score, Lucene91NeighborArray neighbors, - FloatVectorValues vectorValues) + FloatVectorValues.Floats vectorValues) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { float neighborSimilarity = - similarityFunction.compare(candidate, vectorValues.vectorValue(neighbors.node[i])); + similarityFunction.compare(candidate, vectorValues.get(neighbors.node[i])); if (bound.check(neighborSimilarity) == false) { return false; } @@ -293,10 +291,10 @@ private int findNonDiverse(Lucene91NeighborArray neighbors) throws IOException { // them, drop it int neighborId = neighbors.node[i]; bound.set(neighbors.score[i]); - float[] neighborVector = 
vectorValues.vectorValue(neighborId); + float[] neighborVector = vectorValues.get(neighborId); for (int j = maxConn; j > i; j--) { float neighborSimilarity = - similarityFunction.compare(neighborVector, buildVectors.vectorValue(neighbors.node[j])); + similarityFunction.compare(neighborVector, buildVectors.get(neighbors.node[j])); if (bound.check(neighborSimilarity) == false) { // node j is too similar to node i given its score relative to the base node // replace it with the new node, which is at [maxConn] diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java index a984a3ef1f8..0090e59922d 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java @@ -182,10 +182,10 @@ private static DocsWithFieldSet writeVectorData(IndexOutput output, FloatVectorV ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = vectors.iterator(); + FloatVectorValues.Floats values = vectors.values(); for (int docV = iter.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(iter.index()); - binaryVector.asFloatBuffer().put(vectorValue); + binaryVector.asFloatBuffer().put(values.get(iter.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); } @@ -254,7 +254,7 @@ private Lucene91OnHeapHnswGraph writeGraph( beamWidth, Lucene91HnswGraphBuilder.randSeed); hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream); - Lucene91OnHeapHnswGraph graph = hnswGraphBuilder.build(vectorValues.copy()); + Lucene91OnHeapHnswGraph graph = 
hnswGraphBuilder.build(vectorValues); // write vectors' neighbours on each level into the vectorIndex file int countOnLevel0 = graph.size(); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java index bf1c89a536d..93bacddf452 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java @@ -190,12 +190,12 @@ private static DocsWithFieldSet writeVectorData(IndexOutput output, FloatVectorV ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iterator = vectors.iterator(); + FloatVectorValues.Floats values = vectors.values(); for (int docV = iterator.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iterator.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(iterator.index()); - binaryVector.asFloatBuffer().put(vectorValue); + binaryVector.asFloatBuffer().put(values.get(iterator.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java index 01698da7989..3722dc30857 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -604,13 +604,13 @@ private static DocsWithFieldSet writeVectorData( IndexOutput output, FloatVectorValues floatVectorValues) throws IOException { 
DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + FloatVectorValues.Floats values = floatVectorValues.values(); ByteBuffer binaryVector = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = floatVectorValues.vectorValue(iter.index()); - binaryVector.asFloatBuffer().put(vectorValue); + binaryVector.asFloatBuffer().put(values.get(iter.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index c855d8f5e07..94c8094b3ac 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -655,10 +655,10 @@ private static DocsWithFieldSet writeVectorData( ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + FloatVectorValues.Floats values = floatVectorValues.values(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = floatVectorValues.vectorValue(iter.index()); - buffer.asFloatBuffer().put(value); + buffer.asFloatBuffer().put(values.get(iter.index())); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java index e8137bda8d8..c8c9b7e46ef 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java @@ -478,13 +478,11 @@ public static void searchIndex( if (values != null) { assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension()); KnnVectorValues.DocIndexIterator it = values.iterator(); + FloatVectorValues.Floats vectors = values.values(); for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) { float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt}; assertArrayEquals( - "vectors do not match for doc=" + cnt, - expectedVector, - values.vectorValue(it.index()), - 0); + "vectors do not match for doc=" + cnt, expectedVector, vectors.get(it.index()), 0); cnt++; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java index 0a8c4836321..caa42df33d8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java @@ -192,6 +192,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits } FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); + FloatVectorValues.Floats valuesDict = values.values(); for (int ord = 0; ord < values.size(); ord++) { int doc = values.ordToDoc(ord); if (acceptDocs != null && acceptDocs.get(doc) == false) { @@ -202,7 +203,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits break; } - float[] vector = 
values.vectorValue(ord); + float[] vector = valuesDict.get(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -327,8 +328,13 @@ public int size() { } @Override - public float[] vectorValue(int ord) { - return values[ord]; + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) { + return values[ord]; + } + }; } @Override @@ -349,13 +355,12 @@ public VectorScorer scorer(float[] target) { SimpleTextFloatVectorValues simpleTextFloatVectorValues = new SimpleTextFloatVectorValues(this); DocIndexIterator iterator = simpleTextFloatVectorValues.iterator(); + Floats valuesDict = simpleTextFloatVectorValues.values(); return new VectorScorer() { @Override public float score() throws IOException { int ord = iterator.index(); - return entry - .similarityFunction() - .compare(simpleTextFloatVectorValues.vectorValue(ord), target); + return entry.similarityFunction().compare(valuesDict.get(ord), target); } @Override @@ -382,11 +387,6 @@ private void readVector(float[] value) throws IOException { value[i] = Float.parseFloat(floatStrings[i]); } } - - @Override - public SimpleTextFloatVectorValues copy() { - return this; - } } private static class SimpleTextByteVectorValues extends ByteVectorValues { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java index eaf4b657755..137cc48ade4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java @@ -79,18 +79,20 @@ public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); KnnVectorValues.DocIndexIterator iter = 
floatVectorValues.iterator(); + FloatVectorValues.Floats valuesDict = floatVectorValues.values(); for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { - writeFloatVectorValue(floatVectorValues, iter.index()); + writeFloatVectorValue(valuesDict, floatVectorValues.dimension(), iter.index()); docIds.add(docId); } long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); } - private void writeFloatVectorValue(FloatVectorValues vectors, int ord) throws IOException { + private void writeFloatVectorValue(FloatVectorValues.Floats vectors, int dim, int ord) + throws IOException { // write vector value - float[] value = vectors.vectorValue(ord); - assert value.length == vectors.dimension(); + float[] value = vectors.get(ord); + assert value.length == dim; write(vectorData, Arrays.toString(value)); newline(vectorData); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 96b0f75a259..51545b150ba 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -117,13 +117,19 @@ private static class SortingFloatVectorValues extends FloatVectorValues { SortingFloatVectorValues( BufferedFloatVectorValues delegate, DocsWithFieldSet docsWithField, Sorter.DocMap sortMap) throws IOException { - this.delegate = delegate.copy(); + this.delegate = delegate; iteratorSupplier = SortingCodecReader.iteratorSupplier(delegate, sortMap); } @Override - public float[] vectorValue(int ord) throws IOException { - return delegate.vectorValue(ord); + public Floats values() { + Floats delegateFloats = delegate.values(); + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + return delegateFloats.get(ord); + } + }; } @Override @@ 
-136,11 +142,6 @@ public int size() { return delegate.size(); } - @Override - public SortingFloatVectorValues copy() { - throw new UnsupportedOperationException(); - } - @Override public DocIndexIterator iterator() { return iteratorSupplier.get(); @@ -295,19 +296,19 @@ public int ordToDoc(int ord) { } @Override - public float[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + return vectors.get(ord); + } + }; } @Override public DocIndexIterator iterator() { return iterator; } - - @Override - public BufferedFloatVectorValues copy() throws IOException { - return new BufferedFloatVectorValues(vectors, dimension, docsWithField); - } } private static class BufferedByteVectorValues extends ByteVectorValues { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 50af32a7e16..15f186a6744 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -71,9 +71,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); FloatVectorValues mergedFloats = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + FloatVectorValues.Floats mergedDict = mergedFloats.values(); KnnVectorValues.DocIndexIterator iter = mergedFloats.iterator(); for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { - floatWriter.addValue(doc, mergedFloats.vectorValue(iter.index())); + floatWriter.addValue(doc, mergedDict.get(iter.index())); } } } @@ -355,15 +356,27 @@ public long cost() { } @Override - public float[] vectorValue(int ord) throws IOException { - if (ord != lastOrd) { - throw new IllegalStateException( - "only supports forward iteration with a single 
iterator: ord=" - + ord - + ", lastOrd=" - + lastOrd); - } - return current.values.vectorValue(current.index()); + public Floats values() { + return new Floats() { + FloatVectorValues currentValues = null; + Floats currentFloats = null; + + @Override + public float[] get(int ord) throws IOException { + if (ord != lastOrd) { + throw new IllegalStateException( + "only supports forward iteration with a single iterator: ord=" + + ord + + ", lastOrd=" + + lastOrd); + } + if (currentValues != current.values) { + currentValues = current.values; + currentFloats = current.values.values(); + } + return currentFloats.get(current.index()); + } + }; } @Override @@ -385,11 +398,6 @@ public int ordToDoc(int ord) { public VectorScorer scorer(float[] target) { throw new UnsupportedOperationException(); } - - @Override - public FloatVectorValues copy() { - throw new UnsupportedOperationException(); - } } static class MergedByteVectorValues extends ByteVectorValues { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index 3e506037969..d0546d2deed 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -125,15 +125,15 @@ public String toString() { /** RandomVectorScorerSupplier for Float vector */ private static final class FloatScoringSupplier implements RandomVectorScorerSupplier { private final FloatVectorValues vectors; - private final FloatVectorValues vectors1; - private final FloatVectorValues vectors2; + private final FloatVectorValues.Floats vectors1; + private final FloatVectorValues.Floats vectors2; private final VectorSimilarityFunction similarityFunction; private FloatScoringSupplier( FloatVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { this.vectors = vectors; - vectors1 = 
vectors.copy(); - vectors2 = vectors.copy(); + vectors1 = vectors.values(); + vectors2 = vectors.values(); this.similarityFunction = similarityFunction; } @@ -142,7 +142,7 @@ public RandomVectorScorer scorer(int ord) { return new RandomVectorScorer.AbstractRandomVectorScorer(vectors) { @Override public float score(int node) throws IOException { - return similarityFunction.compare(vectors1.vectorValue(ord), vectors2.vectorValue(node)); + return similarityFunction.compare(vectors1.get(ord), vectors2.get(node)); } }; } @@ -160,21 +160,22 @@ public String toString() { /** A {@link RandomVectorScorer} for float vectors. */ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final FloatVectorValues values; + private final FloatVectorValues.Floats floats; private final float[] query; private final VectorSimilarityFunction similarityFunction; public FloatVectorScorer( - FloatVectorValues values, float[] query, VectorSimilarityFunction similarityFunction) { + FloatVectorValues values, float[] query, VectorSimilarityFunction similarityFunction) + throws IOException { super(values); - this.values = values; + this.floats = values.values(); this.query = query; this.similarityFunction = similarityFunction; } @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, values.vectorValue(node)); + return similarityFunction.compare(query, floats.get(node)); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 2384657e93e..5bc64817156 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -38,8 +38,6 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues impleme protected final int size; 
protected final IndexInput slice; protected final int byteSize; - protected int lastOrd = -1; - protected final float[] value; protected final VectorSimilarityFunction similarityFunction; protected final FlatVectorsScorer flatVectorsScorer; @@ -56,7 +54,6 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues impleme this.byteSize = byteSize; this.similarityFunction = similarityFunction; this.flatVectorsScorer = flatVectorsScorer; - value = new float[dimension]; } @Override @@ -75,14 +72,23 @@ public IndexInput getSlice() { } @Override - public float[] vectorValue(int targetOrd) throws IOException { - if (lastOrd == targetOrd) { - return value; - } - slice.seek((long) targetOrd * byteSize); - slice.readFloats(value, 0, value.length); - lastOrd = targetOrd; - return value; + public Floats values() { + IndexInput sliceCopy = slice.clone(); + float[] value = new float[dimension]; + return new Floats() { + int lastOrd = -1; + + @Override + public float[] get(int targetOrd) throws IOException { + if (lastOrd == targetOrd) { + return value; + } + sliceCopy.seek((long) targetOrd * byteSize); + sliceCopy.readFloats(value, 0, value.length); + lastOrd = targetOrd; + return value; + } + }; } public static OffHeapFloatVectorValues load( @@ -136,12 +142,6 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, byteSize, flatVectorsScorer, similarityFunction); } - @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues( - dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); - } - @Override public int ordToDoc(int ord) { return ord; @@ -159,10 +159,9 @@ public DocIndexIterator iterator() { @Override public VectorScorer scorer(float[] query) throws IOException { - DenseOffHeapVectorValues copy = copy(); - DocIndexIterator iterator = copy.iterator(); + DocIndexIterator iterator = iterator(); RandomVectorScorer randomVectorScorer = - 
flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); + flatVectorsScorer.getRandomVectorScorer(similarityFunction, this, query); return new VectorScorer() { @Override public float score() throws IOException { @@ -210,18 +209,6 @@ public SparseOffHeapVectorValues( configuration.size); } - @Override - public SparseOffHeapVectorValues copy() throws IOException { - return new SparseOffHeapVectorValues( - configuration, - dataIn, - slice.clone(), - dimension, - byteSize, - flatVectorsScorer, - similarityFunction); - } - @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -252,10 +239,9 @@ public DocIndexIterator iterator() { @Override public VectorScorer scorer(float[] query) throws IOException { - SparseOffHeapVectorValues copy = copy(); - DocIndexIterator iterator = copy.iterator(); + DocIndexIterator iterator = iterator(); RandomVectorScorer randomVectorScorer = - flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); + flatVectorsScorer.getRandomVectorScorer(similarityFunction, this, query); return new VectorScorer() { @Override public float score() throws IOException { @@ -290,13 +276,13 @@ public int size() { } @Override - public EmptyOffHeapVectorValues copy() { - throw new UnsupportedOperationException(); - } - - @Override - public float[] vectorValue(int targetOrd) { - throw new UnsupportedOperationException(); + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) { + throw new UnsupportedOperationException(); + } + }; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index b731e758b7a..59d8e872d7f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -383,9 +383,10 @@ private 
static DocsWithFieldSet writeVectorData( ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + FloatVectorValues.Floats dict = floatVectorValues.values(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = floatVectorValues.vectorValue(iter.index()); + float[] value = dict.get(iter.index()); buffer.asFloatBuffer().put(value); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java index 24123a4f21e..7d8f49784af 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java @@ -421,8 +421,14 @@ public int size() { } @Override - public float[] vectorValue(int ord) throws IOException { - return rawVectorValues.vectorValue(ord); + public Floats values() throws IOException { + Floats rawDict = rawVectorValues.values(); + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + return rawDict.get(ord); + } + }; } @Override @@ -430,11 +436,6 @@ public int ordToDoc(int ord) { return rawVectorValues.ordToDoc(ord); } - @Override - public QuantizedVectorValues copy() throws IOException { - return new QuantizedVectorValues(rawVectorValues.copy(), quantizedVectorValues.copy()); - } - @Override public VectorScorer scorer(float[] query) throws IOException { return quantizedVectorValues.scorer(query); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index 1a30b5271cd..df2384cf4a5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -869,16 +869,16 @@ public int size() { } @Override - public FloatVectorValues copy() throws IOException { - return this; - } - - @Override - public float[] vectorValue(int ord) throws IOException { - if (ord < 0 || ord >= vectorList.size()) { - throw new IOException("vector ord " + ord + " out of bounds"); - } - return vectorList.get(ord); + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + if (ord < 0 || ord >= vectorList.size()) { + throw new IOException("vector ord " + ord + " out of bounds"); + } + return vectorList.get(ord); + } + }; } @Override @@ -1047,6 +1047,7 @@ static class QuantizedFloatVectorValues extends QuantizedByteVectorValues { private final FloatVectorValues values; private final ScalarQuantizer quantizer; private final byte[] quantizedVector; + private final FloatVectorValues.Floats floats; private int lastOrd = -1; private float offsetValue = 0f; @@ -1055,11 +1056,13 @@ static class QuantizedFloatVectorValues extends QuantizedByteVectorValues { public QuantizedFloatVectorValues( FloatVectorValues values, VectorSimilarityFunction vectorSimilarityFunction, - ScalarQuantizer quantizer) { + ScalarQuantizer quantizer) + throws IOException { this.values = values; this.quantizer = quantizer; - this.quantizedVector = new byte[values.dimension()]; + quantizedVector = new byte[values.dimension()]; this.vectorSimilarityFunction = vectorSimilarityFunction; + floats = values.values(); } @Override @@ -1099,7 +1102,7 @@ public VectorScorer scorer(float[] target) throws IOException { } private float quantize(int ord) throws IOException { - return 
quantizer.quantize(values.vectorValue(ord), quantizedVector, vectorSimilarityFunction); + return quantizer.quantize(floats.get(ord), quantizedVector, vectorSimilarityFunction); } @Override @@ -1199,11 +1202,11 @@ public DocIndexIterator iterator() { static final class NormalizedFloatVectorValues extends FloatVectorValues { private final FloatVectorValues values; - private final float[] normalizedVector; + private final Floats floats; - public NormalizedFloatVectorValues(FloatVectorValues values) { + public NormalizedFloatVectorValues(FloatVectorValues values) throws IOException { this.values = values; - this.normalizedVector = new float[values.dimension()]; + floats = values.values(); } @Override @@ -1222,20 +1225,21 @@ public int ordToDoc(int ord) { } @Override - public float[] vectorValue(int ord) throws IOException { - System.arraycopy(values.vectorValue(ord), 0, normalizedVector, 0, normalizedVector.length); - VectorUtil.l2normalize(normalizedVector); - return normalizedVector; + public Floats values() { + float[] normalizedVector = new float[values.dimension()]; + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + System.arraycopy(floats.get(ord), 0, normalizedVector, 0, normalizedVector.length); + VectorUtil.l2normalize(normalizedVector); + return normalizedVector; + } + }; } @Override public DocIndexIterator iterator() { return values.iterator(); } - - @Override - public NormalizedFloatVectorValues copy() throws IOException { - return new NormalizedFloatVectorValues(values.copy()); - } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java index e9be3423c18..1231166f533 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java @@ -40,7 +40,6 @@ protected ByteVectorValues() {} */ public abstract byte[] vectorValue(int ord) throws 
IOException; - @Override public abstract ByteVectorValues copy() throws IOException; /** diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index becb00cbb5b..eb6108629f9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2762,6 +2762,7 @@ private static void checkFloatVectorValues( throws IOException { int count = 0; int everyNdoc = Math.max(values.size() / 64, 1); + FloatVectorValues.Floats valueDict = values.values(); while (count < values.size()) { // search the first maxNumSearches vectors to exercise the graph if (values.ordToDoc(count) % everyNdoc == 0) { @@ -2769,7 +2770,7 @@ private static void checkFloatVectorValues( if (vectorsReaderSupportsSearch(codecReader, fieldInfo.name)) { codecReader .getVectorReader() - .search(fieldInfo.name, values.vectorValue(count), collector, null); + .search(fieldInfo.name, valueDict.get(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( @@ -2777,7 +2778,7 @@ private static void checkFloatVectorValues( } } } - int valueLength = values.vectorValue(count).length; + int valueLength = valueDict.get(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index 614a652cd35..6b9562fa5d5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -441,8 +441,13 @@ public int dimension() { } @Override - public float[] vectorValue(int ord) throws IOException { - return vectorValues.vectorValue(ord); + public Floats values() throws IOException { + Floats dict = 
vectorValues.values(); + return new Floats() { + public float[] get(int ord) throws IOException { + return dict.get(ord); + } + }; } @Override @@ -464,11 +469,6 @@ public DocIndexIterator iterator() { public VectorScorer scorer(float[] target) throws IOException { return vectorValues.scorer(target); } - - @Override - public FloatVectorValues copy() { - throw new UnsupportedOperationException(); - } } private class ExitableByteVectorValues extends ByteVectorValues { diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index aa840fc3931..3e332b98de3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -32,16 +32,18 @@ public abstract class FloatVectorValues extends KnnVectorValues { /** Sole constructor */ protected FloatVectorValues() {} - /** - * Return the vector value for the given vector ordinal which must be in [0, size() - 1], - * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. - * - * @return the vector value - */ - public abstract float[] vectorValue(int ord) throws IOException; + public abstract static class Floats { + /** + * Return the vector value for the given vector ordinal which must be in [0, size() - 1], + * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. 
+ * + * @return the vector value + */ + public abstract float[] get(int ord) throws IOException; + } - @Override - public abstract FloatVectorValues copy() throws IOException; + /** Returns a random access (lookup by ord) provider of the vector values */ + public abstract Floats values() throws IOException; /** * Checks the Vector Encoding of a field @@ -100,13 +102,13 @@ public int dimension() { } @Override - public float[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); - } - - @Override - public FloatVectorValues copy() { - return this; + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + return vectors.get(ord); + } + }; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java index 8e58f387a33..c1e5c1f1ef9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java @@ -49,12 +49,6 @@ public int ordToDoc(int ord) { return ord; } - /** - * Creates a new copy of this {@link KnnVectorValues}. This is helpful when you need to access - * different values at once, to avoid overwriting the underlying vector returned. 
- */ - public abstract KnnVectorValues copy() throws IOException; - /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ public int getVectorByteLength() { return dimension() * getEncoding().byteSize; diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index 69d557d270a..37cd21abf20 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -853,7 +853,6 @@ class MergedFloatVectorValues extends FloatVectorValues { final DocValuesSub[] subs; final MergedDocIterator iter; final int[] starts; - int lastSubIndex; MergedFloatVectorValues(int dimension, int size, List> subs) { this.dimension = dimension; @@ -884,25 +883,26 @@ public int size() { return size; } - @SuppressWarnings("unchecked") - @Override - public FloatVectorValues copy() throws IOException { - List> subsCopy = new ArrayList<>(); - for (Object sub : subs) { - subsCopy.add((DocValuesSub) sub); - } - return new MergedFloatVectorValues(dimension, size, subsCopy); - } - @Override - public float[] vectorValue(int ord) throws IOException { - assert ord >= 0 && ord < size; - // We need to implement fully random-access API here in order to support callers like - // SortingCodecReader that rely on it. - lastSubIndex = findSub(ord, lastSubIndex, starts); - assert subs[lastSubIndex].sub != null; - return ((FloatVectorValues) subs[lastSubIndex].sub) - .vectorValue(ord - subs[lastSubIndex].ordStart); + public Floats values() { + return new Floats() { + int lastSubIndex = -1; + Floats subDict; + + @Override + public float[] get(int ord) throws IOException { + assert ord >= 0 && ord < size; + // We need to implement fully random-access API here in order to support callers like + // SortingCodecReader that rely on it. 
+ int newSubIndex = findSub(ord, lastSubIndex, starts); + if (newSubIndex != lastSubIndex) { + lastSubIndex = newSubIndex; + assert subs[lastSubIndex].sub != null; + subDict = ((FloatVectorValues) subs[lastSubIndex].sub).values(); + } + return subDict.get(ord - subs[lastSubIndex].ordStart); + } + }; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index daec0c197d6..d92f9ddea00 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -319,9 +319,14 @@ private static class SortingFloatVectorValues extends FloatVectorValues { } @Override - public float[] vectorValue(int ord) throws IOException { - // ords are interpreted in the delegate's ord-space. - return delegate.vectorValue(ord); + public Floats values() throws IOException { + Floats delegateDict = delegate.values(); + return new Floats() { + public float[] get(int ord) throws IOException { + // ords are interpreted in the delegate's ord-space. 
+ return delegateDict.get(ord); + } + }; } @Override @@ -334,11 +339,6 @@ public int size() { return iteratorSupplier.size(); } - @Override - public FloatVectorValues copy() { - throw new UnsupportedOperationException(); - } - @Override public DocIndexIterator iterator() { return iteratorSupplier.get(); diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java index 3f7bcf6c5c4..3a04650aa36 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java @@ -271,11 +271,12 @@ static ScalarQuantizer fromVectors( return new ScalarQuantizer(0f, 0f, bits); } KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); + FloatVectorValues.Floats dict = floatVectorValues.values(); if (confidenceInterval == 1f) { float min = Float.POSITIVE_INFINITY; float max = Float.NEGATIVE_INFINITY; while (iterator.nextDoc() != NO_MORE_DOCS) { - for (float v : floatVectorValues.vectorValue(iterator.index())) { + for (float v : dict.get(iterator.index())) { min = Math.min(min, v); max = Math.max(max, v); } @@ -292,7 +293,7 @@ static ScalarQuantizer fromVectors( int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; while (iterator.nextDoc() != NO_MORE_DOCS) { - float[] vectorValue = floatVectorValues.vectorValue(iterator.index()); + float[] vectorValue = dict.get(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, i * vectorValue.length, vectorValue.length); i++; @@ -317,7 +318,7 @@ static ScalarQuantizer fromVectors( index++; } assert iterator.docID() != NO_MORE_DOCS; - float[] vectorValue = floatVectorValues.vectorValue(iterator.index()); + float[] vectorValue = dict.get(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, idx * vectorValue.length, vectorValue.length); idx++; @@ 
-356,15 +357,12 @@ public static ScalarQuantizer fromVectorsAutoInterval( 1 - 1f / (floatVectorValues.dimension() + 1) }; KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); + FloatVectorValues.Floats dict = floatVectorValues.values(); if (totalVectorCount <= sampleSize) { int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; while (iterator.nextDoc() != NO_MORE_DOCS) { - gatherSample( - floatVectorValues.vectorValue(iterator.index()), - quantileGatheringScratch, - sampledDocs, - i); + gatherSample(dict.get(iterator.index()), quantileGatheringScratch, sampledDocs, i); i++; if (i == scratchSize) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); @@ -385,11 +383,7 @@ public static ScalarQuantizer fromVectorsAutoInterval( index++; } assert iterator.docID() != NO_MORE_DOCS; - gatherSample( - floatVectorValues.vectorValue(iterator.index()), - quantileGatheringScratch, - sampledDocs, - idx); + gatherSample(dict.get(iterator.index()), quantileGatheringScratch, sampledDocs, idx); idx++; if (idx == SCRATCH_SIZE) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); diff --git a/lucene/core/src/test/org/apache/lucene/document/TestField.java b/lucene/core/src/test/org/apache/lucene/document/TestField.java index 5c1b8f17294..e13d8cd6897 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java @@ -726,10 +726,10 @@ public void testKnnVectorField() throws Exception { assertEquals(1, floatValues.size()); KnnVectorValues.DocIndexIterator iterator1 = floatValues.iterator(); assertNotEquals(NO_MORE_DOCS, iterator1.nextDoc()); - assertEquals(vector.length, floatValues.vectorValue(0).length); - assertEquals(vector[0], floatValues.vectorValue(0)[0], 0); + assertEquals(vector.length, floatValues.values().get(0).length); + assertEquals(vector[0], floatValues.values().get(0)[0], 0); 
assertEquals(NO_MORE_DOCS, iterator1.nextDoc()); - expectThrows(IOException.class, () -> floatValues.vectorValue(1)); + expectThrows(IOException.class, () -> floatValues.values().get(1)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java index d03c8cf42b5..c18e4d18170 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java @@ -581,7 +581,7 @@ private static void scanAndRetrieve(LeafReader leaf, KnnVectorValues values) thr if (random().nextBoolean() && iter.docID() != DocIdSetIterator.NO_MORE_DOCS && values instanceof FloatVectorValues) { - ((FloatVectorValues) values).vectorValue(iter.index()); + ((FloatVectorValues) values).values().get(iter.index()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index 41410ad4e39..b5d4babea8e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -414,6 +414,7 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect int nextDocWithVectors = 0; StoredFields storedFields = reader.storedFields(); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = vectorValues.values(); for (int i = 0; i < reader.maxDoc(); i++) { nextDocWithVectors = iterator.advance(i); while (i < nextDocWithVectors && i < reader.maxDoc()) { @@ -427,7 +428,7 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect } int id = Integer.parseInt(storedFields.document(i).get("id")); // documents with KnnGraphValues have the expected vectors - float[] scratch = vectorValues.vectorValue(iterator.index()); + float[] scratch = 
dict.get(iterator.index()); assertArrayEquals( "vector did not match for doc " + i + ", id=" + id + ": " + Arrays.toString(scratch), values[id], diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 9663d676255..6db0aa8644f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -276,7 +276,7 @@ public void testSortOnAddIndicesRandom() throws IOException { assertEquals(1, sorted_numeric_dv.docValueCount()); assertEquals(ids.longValue(), sorted_numeric_dv.nextValue()); - float[] vectorValue = vectorValues.vectorValue(valuesIterator.index()); + float[] vectorValue = vectorValues.values().get(valuesIterator.index()); assertEquals(1, vectorValue.length); assertEquals((float) ids.longValue(), vectorValue[0], 0.001f); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java index 41aeef2e5c8..9234e70b802 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -85,6 +85,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.HnswGraph.NodesIterator; +import org.junit.Ignore; /** Tests HNSW KNN graphs */ abstract class HnswGraphTestCase extends LuceneTestCase { @@ -119,12 +120,12 @@ protected RandomVectorScorerSupplier buildScorerSupplier(KnnVectorValues vectors } protected RandomVectorScorer buildScorer(KnnVectorValues vectors, T query) throws IOException { - KnnVectorValues vectorsCopy = vectors.copy(); return switch (getVectorEncoding()) { case BYTE -> - flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorsCopy, (byte[]) query); + 
flatVectorScorer.getRandomVectorScorer( + similarityFunction, ((ByteVectorValues) vectors).copy(), (byte[]) query); case FLOAT32 -> - flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorsCopy, (float[]) query); + flatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, (float[]) query); }; } @@ -191,11 +192,10 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { doc.add( knnVectorField( "field", - (T) ((FloatVectorValues) vectors).vectorValue(ord), + (T) ((FloatVectorValues) vectors).values().get(ord), similarityFunction)); } } - ; doc.add(new StringField("id", Integer.toString(vectors.ordToDoc(ord)), Field.Store.NO)); iw.addDocument(doc); } @@ -226,12 +226,39 @@ private T vectorValue(KnnVectorValues vectors, int ord) throws IOException { return (T) ((ByteVectorValues) vectors).vectorValue(ord); } case FLOAT32 -> { - return (T) ((FloatVectorValues) vectors).vectorValue(ord); + return (T) ((FloatVectorValues) vectors).values().get(ord); } } throw new AssertionError("unknown encoding " + vectors.getEncoding()); } + interface Vectors { + T get(int ord) throws IOException; + } + + // we used to have a generically-typed vector API, now this persists only in tests + @SuppressWarnings("unchecked") + private static Vectors vectors(KnnVectorValues vectors) throws IOException { + return switch (vectors.getEncoding()) { + case FLOAT32 -> + new Vectors() { + FloatVectorValues.Floats dict = ((FloatVectorValues) vectors).values(); + + @Override + public T get(int ord) throws IOException { + return (T) dict.get(ord); + } + }; + case BYTE -> + new Vectors() { + @Override + public T get(int ord) throws IOException { + return (T) ((ByteVectorValues) vectors).vectorValue(ord); + } + }; + }; + } + // test writing out and reading in a graph gives the expected graph public void testReadWrite() throws IOException { int dim = random().nextInt(100) + 1; @@ -240,7 +267,7 @@ public void testReadWrite() throws IOException { int beamWidth = 
random().nextInt(10) + 5; long seed = random().nextLong(); KnnVectorValues vectors = vectorValues(nDoc, dim); - KnnVectorValues v2 = vectors.copy(), v3 = vectors.copy(); + Vectors v1 = vectors(vectors), v2 = vectors(vectors), v3 = vectors(vectors); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed); HnswGraph hnsw = builder.build(vectors.size()); @@ -268,7 +295,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } }); try (IndexWriter iw = new IndexWriter(dir, iwc)) { - KnnVectorValues.DocIndexIterator it2 = v2.iterator(); + KnnVectorValues.DocIndexIterator it2 = vectors.iterator(); while (it2.nextDoc() != NO_MORE_DOCS) { while (indexedDoc < it2.docID()) { // increment docId in the index by adding empty documents @@ -276,7 +303,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("field", vectorValue(v2, it2.index()), similarityFunction)); + doc.add(knnVectorField("field", vectorValue(vectors, it2.index()), similarityFunction)); doc.add(new StoredField("id", it2.docID())); iw.addDocument(doc); nVec++; @@ -290,7 +317,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { assertEquals(nVec, values.size()); assertEquals(indexedDoc, ctx.reader().maxDoc()); assertEquals(indexedDoc, ctx.reader().numDocs()); - assertVectorsEqual(v3, values); + assertVectorsEqual(vectors, values); HnswGraph graphValues = ((Lucene99HnswVectorsReader) ((PerFieldKnnVectorsFormat.FieldsReader) @@ -632,7 +659,7 @@ public void testHnswGraphBuilderInitializationFromGraph_withNonZeroOffset() thro OnHeapHnswGraph initializerGraph = initializerBuilder.build(initializerVectors.size()); KnnVectorValues finalVectorValues = - vectorValues(totalSize, dim, initializerVectors.copy(), docIdOffset); + vectorValues(totalSize, dim, initializerVectors, docIdOffset); int[] 
initializerOrdMap = createOffsetOrdinalMap(initializerSize, finalVectorValues, docIdOffset); @@ -968,7 +995,7 @@ public void testOnHeapHnswGraphSearch() HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); Bits acceptOrds = random().nextBoolean() ? null : createRandomAcceptOrds(0, size); - + System.out.println("acceptOrds=" + acceptOrds); List queries = new ArrayList<>(); List expects = new ArrayList<>(); for (int i = 0; i < 100; i++) { @@ -1101,18 +1128,11 @@ private int computeOverlap(int[] a, int[] b) { /** Returns vectors evenly distributed around the upper unit semicircle. */ static class CircularFloatVectorValues extends FloatVectorValues { private final int size; - private final float[] value; int doc = -1; CircularFloatVectorValues(int size) { this.size = size; - value = new float[2]; - } - - @Override - public CircularFloatVectorValues copy() { - return new CircularFloatVectorValues(size); } @Override @@ -1125,8 +1145,16 @@ public int size() { return size; } - public float[] vectorValue() { - return vectorValue(doc); + @Override + public Floats values() { + return new Floats() { + float[] value = new float[2]; + + @Override + public float[] get(int ord) { + return unitVector2d(ord / (double) size, value); + } + }; } public int docID() { @@ -1146,11 +1174,6 @@ public int advance(int target) { return doc; } - @Override - public float[] vectorValue(int ord) { - return unitVector2d(ord / (double) size, value); - } - @Override public VectorScorer scorer(float[] target) { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java index 5411f2418de..2ff2d50a223 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java @@ -19,14 
+19,12 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.ArrayUtil; class MockVectorValues extends FloatVectorValues { private final int dimension; private final float[][] denseValues; protected final float[][] values; private final int numVectors; - private final float[] scratch; static MockVectorValues fromValues(float[][] values) { float[] firstNonNull = null; @@ -51,7 +49,6 @@ static MockVectorValues fromValues(float[][] values) { this.values = values; this.denseValues = denseValues; this.numVectors = numVectors; - this.scratch = new float[dimension]; } @Override @@ -65,22 +62,23 @@ public int dimension() { } @Override - public MockVectorValues copy() { - return new MockVectorValues( - ArrayUtil.copyArray(values), dimension, ArrayUtil.copyArray(denseValues), numVectors); - } + public Floats values() { + return new Floats() { + float[] scratch = new float[dimension]; - @Override - public float[] vectorValue(int ord) { - if (LuceneTestCase.random().nextBoolean()) { - return values[ord]; - } else { - // Sometimes use the same scratch array repeatedly, mimicing what the codec will do. - // This should help us catch cases of aliasing where the same vector values source is used - // twice in a single computation. - System.arraycopy(values[ord], 0, scratch, 0, dimension); - return scratch; - } + @Override + public float[] get(int ord) { + if (LuceneTestCase.random().nextBoolean()) { + return values[ord]; + } else { + // Sometimes use the same scratch array repeatedly, mimicing what the codec will do. + // This should help us catch cases of aliasing where the same vector values source is used + // twice in a single computation. 
+ System.arraycopy(values[ord], 0, scratch, 0, dimension); + return scratch; + } + } + }; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java index 52d1da3dfa8..7231460928d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java @@ -71,10 +71,11 @@ MockVectorValues vectorValues(float[][] values) { @Override MockVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { FloatVectorValues vectorValues = reader.getFloatVectorValues(fieldName); + FloatVectorValues.Floats dict = vectorValues.values(); float[][] vectors = new float[reader.maxDoc()][]; for (int i = 0; i < vectorValues.size(); i++) { vectors[vectorValues.ordToDoc(i)] = - ArrayUtil.copyOfSubArray(vectorValues.vectorValue(i), 0, vectorValues.dimension()); + ArrayUtil.copyOfSubArray(dict.get(i), 0, vectorValues.dimension()); } return MockVectorValues.fromValues(vectors); } diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java index f2cc3ac35c0..0b5a6223db6 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java @@ -238,10 +238,15 @@ private static FloatVectorValues fromFloatsNormalized( float[][] floats, Set deletedVectors) { return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) { @Override - public float[] vectorValue(int ord) throws IOException { - float[] v = ArrayUtil.copyArray(floats[ordToDoc[ord]]); - VectorUtil.l2normalize(v); - return v; + public Floats values() { + return new Floats() { + 
@Override + public float[] get(int ord) throws IOException { + float[] v = ArrayUtil.copyArray(floats[ordToDoc[ord]]); + VectorUtil.l2normalize(v); + return v; + } + }; } }; } diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java index 7f56688b799..49db2be488f 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java @@ -306,8 +306,13 @@ public int size() { } @Override - public float[] vectorValue(int ord) throws IOException { - return floats[ordToDoc(ord)]; + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) throws IOException { + return floats[ordToDoc(ord)]; + } + }; } @Override @@ -360,10 +365,5 @@ public int advance(int target) throws IOException { public VectorScorer scorer(float[] target) { throw new UnsupportedOperationException(); } - - @Override - public TestSimpleFloatVectorValues copy() { - return this; - } } } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 04ac9285bab..0b5b5b665b4 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -2301,12 +2301,17 @@ public int size() { } @Override - public float[] vectorValue(int ord) { - if (ord == 0) { - return info.floatVectorValues[0]; - } else { - return null; - } + public Floats values() { + return new Floats() { + @Override + public float[] get(int ord) { + if (ord == 0) { + return info.floatVectorValues[0]; + } else { + return null; + } + } + }; } @Override @@ -2325,13 +2330,12 @@ public VectorScorer scorer(float[] query) { } MemoryFloatVectorValues vectorValues = new MemoryFloatVectorValues(info); 
DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats floats = vectorValues.values(); return new VectorScorer() { @Override public float score() throws IOException { assert iterator.docID() == 0; - return info.fieldInfo - .getVectorSimilarityFunction() - .compare(vectorValues.vectorValue(0), query); + return info.fieldInfo.getVectorSimilarityFunction().compare(floats.get(0), query); } @Override @@ -2340,11 +2344,6 @@ public DocIdSetIterator iterator() { } }; } - - @Override - public MemoryFloatVectorValues copy() { - return this; - } } private static final class MemoryByteVectorValues extends ByteVectorValues { diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java index 7c592868912..c16ed357573 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java @@ -854,7 +854,7 @@ private static void assertFloatVectorValue(MemoryIndex mi, String fieldName, flo assertNotNull(fvv); KnnVectorValues.DocIndexIterator iterator = fvv.iterator(); assertEquals(0, iterator.nextDoc()); - assertArrayEquals(expected, fvv.vectorValue(0), 1e-6f); + assertArrayEquals(expected, fvv.values().get(0), 1e-6f); assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java index f026d9537bc..ae86126cd8c 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java @@ -64,11 +64,12 @@ protected DocIdSetIterator getVectorIterator() { return new VectorFieldFunction(this) { 
KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats values = vectorValues.values(); @Override public float[] floatVectorVal(int doc) throws IOException { if (exists(doc)) { - return vectorValues.vectorValue(iterator.index()); + return values.get(iterator.index()); } else { return null; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java index 88d2adba5fa..3a3d8f97e8a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java @@ -39,6 +39,7 @@ public class KMeans { public static final int DEFAULT_SAMPLE_SIZE = 100_000; private final FloatVectorValues vectors; + private final FloatVectorValues.Floats vectorValues; private final int numVectors; private final int numCentroids; private final Random random; @@ -145,8 +146,10 @@ private KMeans( Random random, KmeansInitializationMethod initializationMethod, int restarts, - int iters) { + int iters) + throws IOException { this.vectors = vectors; + this.vectorValues = vectors.values(); this.numVectors = vectors.size(); this.numCentroids = numCentroids; this.random = random; @@ -197,7 +200,7 @@ private float[][] initializeForgy() throws IOException { float[][] initialCentroids = new float[numCentroids][]; int i = 0; for (Integer selectedIdx : selection) { - float[] vector = vectors.vectorValue(selectedIdx); + float[] vector = vectorValues.get(selectedIdx); initialCentroids[i++] = ArrayUtil.copyOfSubArray(vector, 0, vector.length); } return initialCentroids; @@ -207,7 +210,7 @@ private float[][] initializeForgy() throws IOException { private float[][] initializeReservoirSampling() throws IOException { float[][] initialCentroids = new float[numCentroids][]; for (int index = 0; index < numVectors; index++) { - float[] vector = 
vectors.vectorValue(index); + float[] vector = vectorValues.get(index); if (index < numCentroids) { initialCentroids[index] = ArrayUtil.copyOfSubArray(vector, 0, vector.length); } else if (random.nextDouble() < numCentroids * (1.0 / index)) { @@ -223,7 +226,7 @@ private float[][] initializePlusPlus() throws IOException { float[][] initialCentroids = new float[numCentroids][]; // Choose the first centroid uniformly at random int firstIndex = random.nextInt(numVectors); - float[] value = vectors.vectorValue(firstIndex); + float[] value = vectorValues.get(firstIndex); initialCentroids[0] = ArrayUtil.copyOfSubArray(value, 0, value.length); // Store distances of each point to the nearest centroid @@ -236,7 +239,7 @@ private float[][] initializePlusPlus() throws IOException { double totalSum = 0; for (int j = 0; j < numVectors; j++) { // TODO: replace with RandomVectorScorer::score possible on quantized vectors - float dist = VectorUtil.squareDistance(vectors.vectorValue(j), initialCentroids[i - 1]); + float dist = VectorUtil.squareDistance(vectorValues.get(j), initialCentroids[i - 1]); if (dist < minDistances[j]) { minDistances[j] = dist; } @@ -255,7 +258,7 @@ private float[][] initializePlusPlus() throws IOException { } } // Update centroid - value = vectors.vectorValue(nextCentroidIndex); + value = vectorValues.get(nextCentroidIndex); initialCentroids[i] = ArrayUtil.copyOfSubArray(value, 0, value.length); } return initialCentroids; @@ -289,9 +292,10 @@ private static double runKMeansStep( compensations = new float[numCentroids][centroids[0].length]; } + FloatVectorValues.Floats values = vectors.values(); double sumSquaredDist = 0; for (int docID = 0; docID < vectors.size(); docID++) { - float[] vector = vectors.vectorValue(docID); + float[] vector = values.get(docID); short bestCentroid = 0; if (numCentroids > 1) { float minSquaredDist = Float.MAX_VALUE; @@ -355,16 +359,17 @@ static void assignCentroids( assignedCentroidsIdxs[assignedIndex++] = i; } } + 
FloatVectorValues.Floats vectorValues = vectors.values(); NeighborQueue queue = new NeighborQueue(unassignedCentroidsIdxs.size(), false); for (int i = 0; i < vectors.size(); i++) { - float[] vector = vectors.vectorValue(i); + float[] vector = vectorValues.get(i); for (short j = 0; j < assignedCentroidsIdxs.length; j++) { float squareDist = VectorUtil.squareDistance(centroids[assignedCentroidsIdxs[j]], vector); queue.insertWithOverflow(i, squareDist); } } for (int i = 0; i < unassignedCentroidsIdxs.size(); i++) { - float[] vector = vectors.vectorValue(queue.topNode()); + float[] vector = vectorValues.get(queue.topNode()); int unassignedCentroidIdx = unassignedCentroidsIdxs.get(i); centroids[unassignedCentroidIdx] = ArrayUtil.copyArray(vector); queue.pop(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java index 684c9fac838..02487668d54 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java @@ -47,19 +47,20 @@ public int dimension() { return origin.dimension(); } - @Override - public FloatVectorValues copy() throws IOException { - throw new IllegalStateException("Not supported"); - } - @Override public IndexInput getSlice() { return ((HasIndexSlice) origin).getSlice(); } @Override - public float[] vectorValue(int targetOrd) throws IOException { - return origin.vectorValue(sampleFunction.applyAsInt(targetOrd)); + public Floats values() throws IOException { + Floats originValues = origin.values(); + return new Floats() { + @Override + public float[] get(int targetOrd) throws IOException { + return originValues.get(sampleFunction.applyAsInt(targetOrd)); + } + }; } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java 
b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index e42d3e18981..d22e908e8f3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -440,7 +440,7 @@ public void testAddIndexesDirectory0() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); assertEquals(0, iterator.nextDoc()); - assertEquals(0, vectorValues.vectorValue(0)[0], 0); + assertEquals(0, vectorValues.values().get(0)[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } @@ -466,7 +466,7 @@ public void testAddIndexesDirectory1() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); - assertEquals(0, vectorValues.vectorValue(iterator.index())[0], 0); + assertEquals(0, vectorValues.values().get(iterator.index())[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } @@ -495,10 +495,10 @@ public void testAddIndexesDirectory01() throws Exception { KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); assertEquals(0, iterator.nextDoc()); // The merge order is randomized, we might get 0 first, or 1 - float value = vectorValues.vectorValue(0)[0]; + float value = vectorValues.values().get(0)[0]; assertTrue(value == 0 || value == 1); assertEquals(1, iterator.nextDoc()); - value += vectorValues.vectorValue(1)[0]; + value += vectorValues.values().get(1)[0]; assertEquals(1, value, 0); } } @@ -897,9 +897,10 @@ public void testSparseVectors() throws Exception { if (vectorValues != null) { docCount += vectorValues.size(); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = 
vectorValues.values(); while (true) { if (!(iterator.nextDoc() != NO_MORE_DOCS)) break; - checksum += vectorValues.vectorValue(iterator.index())[0]; + checksum += dict.get(iterator.index())[0]; } } } @@ -1131,15 +1132,16 @@ public void testIndexedValueNotAliased() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); assertEquals(3, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = vectorValues.values(); iterator.nextDoc(); assertEquals(0, iterator.index()); - assertEquals(1, vectorValues.vectorValue(0)[0], 0); + assertEquals(1, dict.get(0)[0], 0); iterator.nextDoc(); assertEquals(1, iterator.index()); - assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals(1, dict.get(1)[0], 0); iterator.nextDoc(); assertEquals(2, iterator.index()); - assertEquals(2, vectorValues.vectorValue(2)[0], 0); + assertEquals(2, dict.get(2)[0], 0); } } } @@ -1163,12 +1165,13 @@ public void testSortedIndex() throws Exception { assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = vectorValues.values(); assertEquals("1", storedFields.document(iterator.nextDoc()).get("id")); - assertEquals(-1f, vectorValues.vectorValue(0)[0], 0); + assertEquals(-1f, dict.get(0)[0], 0); assertEquals("2", storedFields.document(iterator.nextDoc()).get("id")); - assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals(1, dict.get(1)[0], 0); assertEquals("4", storedFields.document(iterator.nextDoc()).get("id")); - assertEquals(0, vectorValues.vectorValue(2)[0], 0); + assertEquals(0, dict.get(2)[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } @@ -1229,20 +1232,22 @@ public void testIndexMultipleKnnVectorFields() throws Exception { assertEquals(2, vectorValues.dimension()); assertEquals(2, vectorValues.size()); KnnVectorValues.DocIndexIterator 
iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = vectorValues.values(); iterator.nextDoc(); - assertEquals(1f, vectorValues.vectorValue(0)[0], 0); + assertEquals(1f, dict.get(0)[0], 0); iterator.nextDoc(); - assertEquals(2f, vectorValues.vectorValue(1)[0], 0); + assertEquals(2f, dict.get(1)[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); FloatVectorValues vectorValues2 = leaf.getFloatVectorValues("field2"); KnnVectorValues.DocIndexIterator it2 = vectorValues2.iterator(); + FloatVectorValues.Floats dict2 = vectorValues2.values(); assertEquals(4, vectorValues2.dimension()); assertEquals(2, vectorValues2.size()); it2.nextDoc(); - assertEquals(2f, vectorValues2.vectorValue(0)[1], 0); + assertEquals(2f, dict2.get(0)[1], 0); it2.nextDoc(); - assertEquals(2f, vectorValues2.vectorValue(1)[1], 0); + assertEquals(2f, dict2.get(1)[1], 0); assertEquals(NO_MORE_DOCS, it2.nextDoc()); FloatVectorValues vectorValues3 = leaf.getFloatVectorValues("field3"); @@ -1250,7 +1255,7 @@ public void testIndexMultipleKnnVectorFields() throws Exception { assertEquals(1, vectorValues3.size()); KnnVectorValues.DocIndexIterator it3 = vectorValues3.iterator(); it3.nextDoc(); - assertEquals(1f, vectorValues3.vectorValue(0)[0], 0.1); + assertEquals(1f, vectorValues3.values().get(0)[0], 0.1); assertEquals(NO_MORE_DOCS, it3.nextDoc()); } } @@ -1316,9 +1321,10 @@ public void testRandom() throws Exception { StoredFields storedFields = ctx.reader().storedFields(); int docId; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = vectorValues.values(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; - float[] v = vectorValues.vectorValue(iterator.index()); + float[] v = dict.get(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1464,7 +1470,10 @@ public void 
testSearchWithVisitedLimit() throws Exception { ctx.reader() .searchNearestVectors( fieldName, randomNormalizedVector(dimension), k, liveDocs, visitedLimit); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, results.totalHits.relation()); + assertEquals( + results.totalHits.toString() + " limit=" + visitedLimit, + TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, + results.totalHits.relation()); assertEquals(visitedLimit, results.totalHits.value()); // check the limit is not hit when it clearly exceeds the number of vectors @@ -1520,9 +1529,10 @@ public void testRandomWithUpdatesAndGraph() throws Exception { int docId; int numLiveDocsWithVectors = 0; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats dict = vectorValues.values(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; - float[] v = vectorValues.vectorValue(iterator.index()); + float[] v = dict.get(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1827,12 +1837,13 @@ public void testVectorValuesReportCorrectDocs() throws Exception { for (LeafReaderContext ctx : r.leaves()) { FloatVectorValues vectorValues = ctx.reader().getFloatVectorValues("knn_vector"); if (vectorValues != null) { + FloatVectorValues.Floats dict = vectorValues.values(); docCount += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { int ord = iter.index(); - checksum += vectorValues.vectorValue(ord)[0]; + checksum += dict.get(ord)[0]; Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } @@ -1893,12 +1904,12 @@ public void testMismatchedFields() throws Exception { assertNotNull(floatVectors); iter = floatVectors.iterator(); 
assertEquals(0, iter.nextDoc()); - float[] vector = floatVectors.vectorValue(0); + float[] vector = floatVectors.values().get(0); assertEquals(2, vector.length); assertEquals(1f, vector[0], 0f); assertEquals(2f, vector[1], 0f); assertEquals(1, iter.nextDoc()); - vector = floatVectors.vectorValue(1); + vector = floatVectors.values().get(1); assertEquals(2, vector.length); assertEquals(1f, vector[0], 0f); assertEquals(2f, vector[1], 0f); From 1c2977f8d8860a7cc36f5dc1c3f9d4ca572dc81b Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Sun, 6 Oct 2024 10:14:26 -0400 Subject: [PATCH 02/25] refactor byte vector values random access --- .../synonym/word2vec/Word2VecModel.java | 11 -- .../lucene91/Lucene91HnswVectorsReader.java | 2 - .../lucene92/OffHeapFloatVectorValues.java | 5 - .../lucene94/OffHeapByteVectorValues.java | 86 ++++++------- .../lucene94/OffHeapFloatVectorValues.java | 5 - .../lucene94/Lucene94HnswVectorsWriter.java | 3 +- .../lucene95/Lucene95HnswVectorsWriter.java | 3 +- .../bitvectors/FlatBitVectorsScorer.java | 20 ++- .../SimpleTextKnnVectorsReader.java | 40 +++--- .../SimpleTextKnnVectorsWriter.java | 10 +- .../codecs/BufferingKnnVectorsWriter.java | 34 +++-- .../lucene/codecs/KnnVectorsWriter.java | 42 ++++--- .../codecs/hnsw/DefaultFlatVectorScorer.java | 29 ++--- .../hnsw/ScalarQuantizedVectorScorer.java | 33 ++--- .../lucene95/OffHeapByteVectorValues.java | 117 +++++++++-------- .../lucene95/OffHeapFloatVectorValues.java | 5 - .../lucene99/Lucene99FlatVectorsWriter.java | 8 +- .../Lucene99ScalarQuantizedVectorScorer.java | 68 +++++----- .../Lucene99ScalarQuantizedVectorsWriter.java | 118 ++++++++++-------- .../OffHeapQuantizedByteVectorValues.java | 113 ++++++----------- .../apache/lucene/index/ByteVectorValues.java | 42 ++++--- .../org/apache/lucene/index/CheckIndex.java | 7 +- .../lucene/index/ExitableDirectoryReader.java | 17 +-- .../lucene/index/FloatVectorValues.java | 1 + .../SlowCompositeCodecReaderWrapper.java | 43 +++---- 
.../lucene/index/SortingCodecReader.java | 17 +-- .../CloseableRandomVectorScorerSupplier.java | 5 +- .../util/hnsw/HnswConcurrentMergeBuilder.java | 2 +- .../util/hnsw/RandomVectorScorerSupplier.java | 6 - .../QuantizedByteVectorValues.java | 38 ++++-- ...MemorySegmentByteVectorScorerSupplier.java | 36 ++---- ...estLucene99HnswQuantizedVectorsFormat.java | 6 +- ...stLucene99ScalarQuantizedVectorScorer.java | 30 ++--- ...tLucene99ScalarQuantizedVectorsFormat.java | 6 +- .../org/apache/lucene/document/TestField.java | 7 +- .../vectorization/TestVectorScorer.java | 4 +- .../lucene/util/hnsw/HnswGraphTestCase.java | 82 +++--------- .../util/hnsw/MockByteVectorValues.java | 53 +++----- .../lucene/util/hnsw/MockVectorValues.java | 16 +-- .../util/hnsw/TestHnswByteVectorGraph.java | 3 +- .../lucene/index/memory/MemoryIndex.java | 31 +++-- .../lucene/index/memory/TestMemoryIndex.java | 2 +- .../valuesource/ByteKnnVectorFieldSource.java | 3 +- .../index/BaseKnnVectorsFormatTestCase.java | 69 +++++----- 44 files changed, 575 insertions(+), 703 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java index 4b264c97582..ced8a4230fe 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java @@ -43,17 +43,6 @@ public Word2VecModel(int dictionarySize, int vectorDimension) { this.word2Vec = new BytesRefHash(); } - private Word2VecModel( - int dictionarySize, - int vectorDimension, - TermAndVector[] termsAndVectors, - BytesRefHash word2Vec) { - this.dictionarySize = dictionarySize; - this.vectorDimension = vectorDimension; - this.termsAndVectors = termsAndVectors; - this.word2Vec = word2Vec; - } - public void addTermAndVector(TermAndVector modelEntry) { modelEntry = 
modelEntry.normalizeVector(); this.termsAndVectors[loadedCount++] = modelEntry; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java index 24997307197..25f362ad09d 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java @@ -401,7 +401,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues { private final int dimension; private final int size; - private final int[] ordToDoc; private final IntUnaryOperator ordToDocOperator; private final IndexInput dataIn; private final int byteSize; @@ -415,7 +414,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues { IndexInput dataIn) { this.dimension = dimension; this.size = size; - this.ordToDoc = ordToDoc; ordToDocOperator = ordToDoc == null ? 
IntUnaryOperator.identity() : (ord) -> ordToDoc[ord]; this.dataIn = dataIn; this.similarityFunction = similarityFunction; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 726b0187519..16e0a89b0eb 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -137,9 +137,6 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexedDISI disi; - // dataIn was used to init a new IndexedDIS for #randomAccess() - private final IndexInput dataIn; - private final Lucene92HnswVectorsReader.FieldEntry fieldEntry; public SparseOffHeapVectorValues( Lucene92HnswVectorsReader.FieldEntry fieldEntry, @@ -149,10 +146,8 @@ public SparseOffHeapVectorValues( throws IOException { super(fieldEntry.dimension(), fieldEntry.size(), vectorSimilarityFunction, slice); - this.fieldEntry = fieldEntry; final RandomAccessInput addressesData = dataIn.randomAccessSlice(fieldEntry.addressesOffset(), fieldEntry.addressesLength()); - this.dataIn = dataIn; this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta(), addressesData); this.disi = new IndexedDISI( diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java index 0c428bb169f..afb890f4863 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java +++ 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java @@ -36,9 +36,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues { protected final int dimension; protected final int size; protected final IndexInput slice; - protected int lastOrd = -1; - protected final byte[] binaryValue; - protected final ByteBuffer byteBuffer; protected final int byteSize; protected final VectorSimilarityFunction vectorSimilarityFunction; @@ -53,8 +50,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues { this.slice = slice; this.byteSize = byteSize; this.vectorSimilarityFunction = vectorSimilarityFunction; - byteBuffer = ByteBuffer.allocate(byteSize); - binaryValue = byteBuffer.array(); } @Override @@ -68,17 +63,28 @@ public int size() { } @Override - public byte[] vectorValue(int targetOrd) throws IOException { - if (lastOrd != targetOrd) { - readValue(targetOrd); - lastOrd = targetOrd; - } - return binaryValue; - } + public Bytes values() throws IOException { + return new Bytes() { + IndexInput input = slice.clone(); + ByteBuffer byteBuffer = ByteBuffer.allocate(byteSize); + ; + byte[] binaryValue = byteBuffer.array(); + int lastOrd = -1; + + @Override + public byte[] get(int targetOrd) throws IOException { + if (lastOrd != targetOrd) { + readValue(targetOrd); + lastOrd = targetOrd; + } + return binaryValue; + } - private void readValue(int targetOrd) throws IOException { - slice.seek((long) targetOrd * byteSize); - slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + private void readValue(int targetOrd) throws IOException { + input.seek((long) targetOrd * byteSize); + input.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } + }; } static OffHeapByteVectorValues load( @@ -115,12 +121,6 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, vectorSimilarityFunction, byteSize); } - @Override - public DenseOffHeapVectorValues copy() throws IOException { 
- return new DenseOffHeapVectorValues( - dimension, size, slice.clone(), vectorSimilarityFunction, byteSize); - } - @Override public DocIndexIterator iterator() { return createDenseIterator(); @@ -133,12 +133,12 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(byte[] query) throws IOException { - DenseOffHeapVectorValues copy = this.copy(); - DocIndexIterator iterator = copy.iterator(); + DocIndexIterator iterator = iterator(); + Bytes vectors = values(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(copy.vectorValue(iterator.index()), query); + return vectorSimilarityFunction.compare(vectors.get(iterator.index()), query); } @Override @@ -151,10 +151,9 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapByteVectorValues { private final DirectMonotonicReader ordToDoc; - private final IndexedDISI disi; - // dataIn was used to init a new IndexedDIS for #randomAccess() private final IndexInput dataIn; private final Lucene94HnswVectorsReader.FieldEntry fieldEntry; + private final IndexedDISI disi; public SparseOffHeapVectorValues( Lucene94HnswVectorsReader.FieldEntry fieldEntry, @@ -165,14 +164,17 @@ public SparseOffHeapVectorValues( throws IOException { super(fieldEntry.dimension(), fieldEntry.size(), slice, vectorSimilarityFunction, byteSize); - this.fieldEntry = fieldEntry; final RandomAccessInput addressesData = dataIn.randomAccessSlice(fieldEntry.addressesOffset(), fieldEntry.addressesLength()); - this.dataIn = dataIn; this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta(), addressesData); - this.disi = - new IndexedDISI( - dataIn, + this.fieldEntry = fieldEntry; + this.dataIn = dataIn; + this.disi = createDISI(); + } + + IndexedDISI createDISI() throws IOException { + return new IndexedDISI( + dataIn.clone(), fieldEntry.docsWithFieldOffset(), fieldEntry.docsWithFieldLength(), 
fieldEntry.jumpTableEntryCount(), @@ -180,12 +182,6 @@ public SparseOffHeapVectorValues( fieldEntry.size()); } - @Override - public SparseOffHeapVectorValues copy() throws IOException { - return new SparseOffHeapVectorValues( - fieldEntry, dataIn, slice.clone(), vectorSimilarityFunction, byteSize); - } - @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -216,12 +212,13 @@ public int length() { @Override public VectorScorer scorer(byte[] query) throws IOException { - SparseOffHeapVectorValues copy = this.copy(); - IndexedDISI disi = copy.disi; return new VectorScorer() { + ByteVectorValues.Bytes vectors = values(); + IndexedDISI disi = createDISI(); + @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(copy.vectorValue(disi.index()), query); + return vectorSimilarityFunction.compare(vectors.get(disi.index()), query); } @Override @@ -249,13 +246,8 @@ public int size() { } @Override - public OffHeapByteVectorValues copy() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] vectorValue(int targetOrd) throws IOException { - throw new UnsupportedOperationException(); + public Bytes values() { + return Bytes.EMPTY; } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index 04dbabcc653..96d010071f9 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -149,9 +149,6 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexedDISI disi; - // dataIn was used to init a new 
IndexedDIS for #randomAccess() - private final IndexInput dataIn; - private final Lucene94HnswVectorsReader.FieldEntry fieldEntry; public SparseOffHeapVectorValues( Lucene94HnswVectorsReader.FieldEntry fieldEntry, @@ -162,10 +159,8 @@ public SparseOffHeapVectorValues( throws IOException { super(fieldEntry.dimension(), fieldEntry.size(), slice, vectorSimilarityFunction, byteSize); - this.fieldEntry = fieldEntry; final RandomAccessInput addressesData = dataIn.randomAccessSlice(fieldEntry.addressesOffset(), fieldEntry.addressesLength()); - this.dataIn = dataIn; this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta(), addressesData); this.disi = new IndexedDISI( diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java index 3722dc30857..ca7dc1cf192 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -587,9 +587,10 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + ByteVectorValues.Bytes vectors = byteVectorValues.values(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); + byte[] binaryValue = vectors.get(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); diff --git 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index 94c8094b3ac..481a576253a 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -635,9 +635,10 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + ByteVectorValues.Bytes vectors = byteVectorValues.values(); for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); + byte[] binaryValue = vectors.get(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docId); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java index 8ffcc1c8d50..1a6f4878394 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java @@ -60,18 +60,20 @@ public RandomVectorScorer getRandomVectorScorer( static class BitRandomVectorScorer implements RandomVectorScorer { private final ByteVectorValues vectorValues; + private final ByteVectorValues.Bytes vectors; private final int bitDimensions; private final byte[] query; - BitRandomVectorScorer(ByteVectorValues vectorValues, byte[] query) { + 
BitRandomVectorScorer(ByteVectorValues vectorValues, byte[] query) throws IOException { this.query = query; this.bitDimensions = vectorValues.dimension() * Byte.SIZE; this.vectorValues = vectorValues; + vectors = vectorValues.values(); } @Override public float score(int node) throws IOException { - return (bitDimensions - VectorUtil.xorBitCount(query, vectorValues.vectorValue(node))) + return (bitDimensions - VectorUtil.xorBitCount(query, vectors.get(node))) / (float) bitDimensions; } @@ -93,24 +95,16 @@ public Bits getAcceptOrds(Bits acceptDocs) { static class BitRandomVectorScorerSupplier implements RandomVectorScorerSupplier { protected final ByteVectorValues vectorValues; - protected final ByteVectorValues vectorValues1; - protected final ByteVectorValues vectorValues2; + protected final ByteVectorValues.Bytes vectors; public BitRandomVectorScorerSupplier(ByteVectorValues vectorValues) throws IOException { this.vectorValues = vectorValues; - this.vectorValues1 = vectorValues.copy(); - this.vectorValues2 = vectorValues.copy(); + this.vectors = vectorValues.values(); } @Override public RandomVectorScorer scorer(int ord) throws IOException { - byte[] query = vectorValues1.vectorValue(ord); - return new BitRandomVectorScorer(vectorValues2, query); - } - - @Override - public RandomVectorScorerSupplier copy() throws IOException { - return new BitRandomVectorScorerSupplier(vectorValues.copy()); + return new BitRandomVectorScorer(vectorValues, vectors.get(ord)); } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java index caa42df33d8..1fe9dda39a6 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java @@ -221,6 +221,7 @@ public void search(String field, byte[] target, KnnCollector 
knnCollector, Bits + " differs from field dimension: " + values.dimension()); } + ByteVectorValues.Bytes vectors = values.values(); FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); @@ -234,7 +235,7 @@ public void search(String field, byte[] target, KnnCollector knnCollector, Bits break; } - byte[] vector = values.vectorValue(ord); + byte[] vector = vectors.get(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -300,13 +301,10 @@ private static class SimpleTextFloatVectorValues extends FloatVectorValues { private final IndexInput in; private final float[][] values; - int curOrd; - SimpleTextFloatVectorValues(FieldEntry entry, IndexInput in) throws IOException { this.entry = entry; this.in = in; values = new float[entry.size()][entry.dimension]; - curOrd = -1; readAllVectors(); } @@ -314,7 +312,6 @@ private SimpleTextFloatVectorValues(SimpleTextFloatVectorValues other) { this.entry = other.entry; this.in = other.in.clone(); this.values = other.values; - this.curOrd = other.curOrd; } @Override @@ -394,18 +391,12 @@ private static class SimpleTextByteVectorValues extends ByteVectorValues { private final BytesRefBuilder scratch = new BytesRefBuilder(); private final FieldEntry entry; private final IndexInput in; - private final BytesRef binaryValue; private final byte[][] values; - int curOrd; - SimpleTextByteVectorValues(FieldEntry entry, IndexInput in) throws IOException { this.entry = entry; this.in = in; values = new byte[entry.size()][entry.dimension]; - binaryValue = new BytesRef(entry.dimension); - binaryValue.length = binaryValue.bytes.length; - curOrd = -1; readAllVectors(); } @@ -413,9 +404,6 @@ private SimpleTextByteVectorValues(SimpleTextByteVectorValues other) { this.entry = other.entry; this.in = other.in.clone(); this.values = other.values; - this.binaryValue = new 
BytesRef(entry.dimension); - this.binaryValue.length = binaryValue.bytes.length; - this.curOrd = other.curOrd; } @Override @@ -429,9 +417,17 @@ public int size() { } @Override - public byte[] vectorValue(int ord) { - binaryValue.bytes = values[ord]; - return binaryValue.bytes; + public Bytes values() { + BytesRef binaryValue = new BytesRef(entry.dimension); + binaryValue.length = binaryValue.bytes.length; + + return new Bytes() { + @Override + public byte[] get(int ord) { + binaryValue.bytes = values[ord]; + return binaryValue.bytes; + } + }; } @Override @@ -450,15 +446,14 @@ public VectorScorer scorer(byte[] target) { return null; } SimpleTextByteVectorValues simpleTextByteVectorValues = new SimpleTextByteVectorValues(this); + ByteVectorValues.Bytes vectors = simpleTextByteVectorValues.values(); return new VectorScorer() { DocIndexIterator it = simpleTextByteVectorValues.iterator(); @Override public float score() throws IOException { int ord = it.index(); - return entry - .similarityFunction() - .compare(simpleTextByteVectorValues.vectorValue(ord), target); + return entry.similarityFunction().compare(vectors.get(ord), target); } @Override @@ -485,11 +480,6 @@ private void readVector(byte[] value) throws IOException { value[i] = (byte) Float.parseFloat(floatStrings[i]); } } - - @Override - public SimpleTextByteVectorValues copy() { - return this; - } } private int readInt(IndexInput in, BytesRef field) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java index 137cc48ade4..42dbe7e85ac 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java @@ -103,18 +103,20 @@ public void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, i long 
vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); KnnVectorValues.DocIndexIterator it = byteVectorValues.iterator(); + ByteVectorValues.Bytes vectors = byteVectorValues.values(); for (int docV = it.nextDoc(); docV != NO_MORE_DOCS; docV = it.nextDoc()) { - writeByteVectorValue(byteVectorValues, it.index()); + writeByteVectorValue(vectors, it.index(), byteVectorValues.dimension()); docIds.add(docV); } long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); } - private void writeByteVectorValue(ByteVectorValues vectors, int ord) throws IOException { + private void writeByteVectorValue(ByteVectorValues.Bytes vectors, int ord, int dim) + throws IOException { // write vector value - byte[] value = vectors.vectorValue(ord); - assert value.length == vectors.dimension(); + byte[] value = vectors.get(ord); + assert value.length == dim; write(vectorData, Arrays.toString(value)); newline(vectorData); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 51545b150ba..db23afcff17 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -161,8 +161,15 @@ private static class SortingByteVectorValues extends ByteVectorValues { } @Override - public byte[] vectorValue(int ord) throws IOException { - return delegate.vectorValue(ord); + public Bytes values() throws IOException { + return new Bytes() { + Bytes values = delegate.values(); + + @Override + public byte[] get(int ord) throws IOException { + return values.get(ord); + } + }; } @Override @@ -175,11 +182,6 @@ public int size() { return delegate.size(); } - @Override - public SortingByteVectorValues copy() { - throw new UnsupportedOperationException(); - } - @Override public 
DocIndexIterator iterator() { return iteratorSupplier.get(); @@ -269,14 +271,12 @@ private static class BufferedFloatVectorValues extends FloatVectorValues { // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; - private final DocIdSet docsWithField; private final DocIndexIterator iterator; BufferedFloatVectorValues(List vectors, int dimension, DocIdSet docsWithField) throws IOException { this.vectors = vectors; this.dimension = dimension; - this.docsWithField = docsWithField; this.iterator = fromDISI(docsWithField.iterator()); } @@ -315,14 +315,12 @@ private static class BufferedByteVectorValues extends ByteVectorValues { // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; - private final DocIdSet docsWithField; private final DocIndexIterator iterator; BufferedByteVectorValues(List vectors, int dimension, DocIdSet docsWithField) throws IOException { this.vectors = vectors; this.dimension = dimension; - this.docsWithField = docsWithField; iterator = fromDISI(docsWithField.iterator()); } @@ -337,18 +335,18 @@ public int size() { } @Override - public byte[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); + public Bytes values() { + return new Bytes() { + @Override + public byte[] get(int targetOrd) { + return vectors.get(targetOrd); + } + }; } @Override public DocIndexIterator iterator() { return iterator; } - - @Override - public BufferedByteVectorValues copy() throws IOException { - return new BufferedByteVectorValues(vectors, dimension, docsWithField); - } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 15f186a6744..011e550ffb8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ 
-61,9 +61,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); ByteVectorValues mergedBytes = MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); + ByteVectorValues.Bytes values = mergedBytes.values(); KnnVectorValues.DocIndexIterator iter = mergedBytes.iterator(); for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { - byteWriter.addValue(doc, mergedBytes.vectorValue(iter.index())); + byteWriter.addValue(doc, values.get(iter.index())); } } case FLOAT32 -> { @@ -71,10 +72,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); FloatVectorValues mergedFloats = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); - FloatVectorValues.Floats mergedDict = mergedFloats.values(); + FloatVectorValues.Floats values = mergedFloats.values(); KnnVectorValues.DocIndexIterator iter = mergedFloats.iterator(); for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { - floatWriter.addValue(doc, mergedDict.get(iter.index())); + floatWriter.addValue(doc, values.get(iter.index())); } } } @@ -421,14 +422,27 @@ private MergedByteVectorValues(List subs, MergeState mergeS } @Override - public byte[] vectorValue(int ord) throws IOException { - if (ord != lastOrd + 1) { - throw new IllegalStateException( - "only supports forward iteration: ord=" + ord + ", lastOrd=" + lastOrd); - } else { - lastOrd = ord; - } - return current.values.vectorValue(current.index()); + public Bytes values() { + return new Bytes() { + ByteVectorValues currentValues = null; + Bytes currentBytes = null; + + @Override + public byte[] get(int ord) throws IOException { + if (ord != lastOrd) { + throw new IllegalStateException( + "only supports forward iteration with a single iterator: ord=" + + ord + + ", lastOrd=" + + lastOrd); + } + if (currentValues != 
current.values) { + currentValues = current.values; + currentBytes = current.values.values(); + } + return currentBytes.get(current.index()); + } + }; } @Override @@ -455,6 +469,7 @@ public int nextDoc() throws IOException { } else { docId = current.mappedDocID; ++index; + ++lastOrd; } return docId; } @@ -490,11 +505,6 @@ public int ordToDoc(int ord) { public VectorScorer scorer(byte[] target) { throw new UnsupportedOperationException(); } - - @Override - public ByteVectorValues copy() { - throw new UnsupportedOperationException(); - } } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index d0546d2deed..729dd6616c4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -89,15 +89,15 @@ public String toString() { /** RandomVectorScorerSupplier for bytes vector */ private static final class ByteScoringSupplier implements RandomVectorScorerSupplier { private final ByteVectorValues vectors; - private final ByteVectorValues vectors1; - private final ByteVectorValues vectors2; + private final ByteVectorValues.Bytes vectors1; + private final ByteVectorValues.Bytes vectors2; private final VectorSimilarityFunction similarityFunction; private ByteScoringSupplier( ByteVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { this.vectors = vectors; - vectors1 = vectors.copy(); - vectors2 = vectors.copy(); + vectors1 = vectors.values(); + vectors2 = vectors.values(); this.similarityFunction = similarityFunction; } @@ -106,16 +106,11 @@ public RandomVectorScorer scorer(int ord) { return new RandomVectorScorer.AbstractRandomVectorScorer(vectors) { @Override public float score(int node) throws IOException { - return similarityFunction.compare(vectors1.vectorValue(ord), vectors2.vectorValue(node)); 
+ return similarityFunction.compare(vectors1.get(ord), vectors2.get(node)); } }; } - @Override - public RandomVectorScorerSupplier copy() throws IOException { - return new ByteScoringSupplier(vectors, similarityFunction); - } - @Override public String toString() { return "ByteScoringSupplier(similarityFunction=" + similarityFunction + ")"; @@ -147,11 +142,6 @@ public float score(int node) throws IOException { }; } - @Override - public RandomVectorScorerSupplier copy() throws IOException { - return new FloatScoringSupplier(vectors, similarityFunction); - } - @Override public String toString() { return "FloatScoringSupplier(similarityFunction=" + similarityFunction + ")"; @@ -181,21 +171,22 @@ public float score(int node) throws IOException { /** A {@link RandomVectorScorer} for byte vectors. */ private static class ByteVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final ByteVectorValues values; + private final ByteVectorValues.Bytes vectors; private final byte[] query; private final VectorSimilarityFunction similarityFunction; public ByteVectorScorer( - ByteVectorValues values, byte[] query, VectorSimilarityFunction similarityFunction) { + ByteVectorValues values, byte[] query, VectorSimilarityFunction similarityFunction) + throws IOException { super(values); - this.values = values; + vectors = values.values(); this.query = query; this.similarityFunction = similarityFunction; } @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, values.vectorValue(node)); + return similarityFunction.compare(query, vectors.get(node)); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java index ceb826aa3a1..5b364765435 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java @@ -87,10 +87,12 @@ public RandomVectorScorer getRandomVectorScorer( scalarQuantizer.getConstantMultiplier(), scalarQuantizer.getBits()); return new RandomVectorScorer.AbstractRandomVectorScorer(quantizedByteVectorValues) { + QuantizedByteVectorValues.QuantizedBytes values = quantizedByteVectorValues.values(); + @Override public float score(int node) throws IOException { - byte[] nodeVector = quantizedByteVectorValues.vectorValue(node); - float nodeOffset = quantizedByteVectorValues.getScoreCorrectionConstant(node); + byte[] nodeVector = values.get(node); + float nodeOffset = values.getScoreCorrectionConstant(node); return scalarQuantizedVectorSimilarity.score( targetBytes, offsetCorrection, nodeVector, nodeOffset); } @@ -137,36 +139,21 @@ public ScalarQuantizedRandomVectorScorerSupplier( this.vectorSimilarityFunction = similarityFunction; } - private ScalarQuantizedRandomVectorScorerSupplier( - ScalarQuantizedVectorSimilarity similarity, - VectorSimilarityFunction vectorSimilarityFunction, - QuantizedByteVectorValues values) { - this.similarity = similarity; - this.values = values; - this.vectorSimilarityFunction = vectorSimilarityFunction; - } - @Override public RandomVectorScorer scorer(int ord) throws IOException { - final QuantizedByteVectorValues vectorsCopy = values.copy(); - final byte[] queryVector = values.vectorValue(ord); - final float queryOffset = values.getScoreCorrectionConstant(ord); - return new RandomVectorScorer.AbstractRandomVectorScorer(vectorsCopy) { + final QuantizedByteVectorValues.QuantizedBytes vectors = values.values(); + final byte[] queryVector = vectors.get(ord); + final float queryOffset = vectors.getScoreCorrectionConstant(ord); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { @Override public float score(int node) throws IOException { - byte[] nodeVector = vectorsCopy.vectorValue(node); - float nodeOffset = 
vectorsCopy.getScoreCorrectionConstant(node); + byte[] nodeVector = vectors.get(node); + float nodeOffset = vectors.getScoreCorrectionConstant(node); return similarity.score(queryVector, queryOffset, nodeVector, nodeOffset); } }; } - @Override - public RandomVectorScorerSupplier copy() throws IOException { - return new ScalarQuantizedRandomVectorScorerSupplier( - similarity, vectorSimilarityFunction, values.copy()); - } - @Override public String toString() { return "ScalarQuantizedRandomVectorScorerSupplier(vectorSimilarityFunction=" diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java index 1e78c8ea7aa..5ccbde1b3bf 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java @@ -33,14 +33,11 @@ import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. 
*/ -public abstract class OffHeapByteVectorValues extends ByteVectorValues implements HasIndexSlice { +public abstract class OffHeapByteVectorValues extends ByteVectorValues { protected final int dimension; protected final int size; protected final IndexInput slice; - protected int lastOrd = -1; - protected final byte[] binaryValue; - protected final ByteBuffer byteBuffer; protected final int byteSize; protected final VectorSimilarityFunction similarityFunction; protected final FlatVectorsScorer flatVectorsScorer; @@ -56,8 +53,6 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues implement this.size = size; this.slice = slice; this.byteSize = byteSize; - byteBuffer = ByteBuffer.allocate(byteSize); - binaryValue = byteBuffer.array(); this.similarityFunction = similarityFunction; this.flatVectorsScorer = flatVectorsScorer; } @@ -73,22 +68,42 @@ public int size() { } @Override - public byte[] vectorValue(int targetOrd) throws IOException { - if (lastOrd != targetOrd) { - readValue(targetOrd); - lastOrd = targetOrd; - } - return binaryValue; + public Bytes values() throws IOException { + return new OffHeapBytes(); } - @Override - public IndexInput getSlice() { - return slice; - } + // can't be anonymous because it extends + implements + class OffHeapBytes extends Bytes implements HasIndexSlice { + + private final ByteBuffer byteBuffer; + private final byte[] binaryValue; + private final IndexInput input; + private int lastOrd = -1; + + OffHeapBytes() throws IOException { + byteBuffer = ByteBuffer.allocate(byteSize); + binaryValue = byteBuffer.array(); + input = slice.clone(); + } + + @Override + public byte[] get(int targetOrd) throws IOException { + if (lastOrd != targetOrd) { + readValue(targetOrd); + lastOrd = targetOrd; + } + return binaryValue; + } - private void readValue(int targetOrd) throws IOException { - slice.seek((long) targetOrd * byteSize); - slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + @Override + 
public IndexInput getSlice() { + return input; + } + + private void readValue(int targetOrd) throws IOException { + input.seek((long) targetOrd * byteSize); + input.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } } public static OffHeapByteVectorValues load( @@ -140,12 +155,6 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, byteSize, flatVectorsScorer, vectorSimilarityFunction); } - @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues( - dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); - } - @Override public DocIndexIterator iterator() { return createDenseIterator(); @@ -158,10 +167,9 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(byte[] query) throws IOException { - DenseOffHeapVectorValues copy = copy(); - DocIndexIterator iterator = copy.iterator(); + DocIndexIterator iterator = iterator(); RandomVectorScorer scorer = - flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); + flatVectorsScorer.getRandomVectorScorer(similarityFunction, this, query); return new VectorScorer() { @Override public float score() throws IOException { @@ -178,10 +186,10 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapByteVectorValues { private final DirectMonotonicReader ordToDoc; - private final IndexedDISI disi; - // dataIn was used to init a new IndexedDIS for #randomAccess() private final IndexInput dataIn; private final OrdToDocDISIReaderConfiguration configuration; + private final IndexedDISI disi; + private DocIndexIterator iterator; public SparseOffHeapVectorValues( OrdToDocDISIReaderConfiguration configuration, @@ -200,14 +208,17 @@ public SparseOffHeapVectorValues( byteSize, flatVectorsScorer, vectorSimilarityFunction); - this.configuration = configuration; final RandomAccessInput addressesData = 
dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength); - this.dataIn = dataIn; this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData); - this.disi = - new IndexedDISI( - dataIn, + this.dataIn = dataIn; + this.configuration = configuration; + this.disi = createDISI(); + } + + IndexedDISI createDISI() throws IOException { + return new IndexedDISI( + dataIn.clone(), configuration.docsWithFieldOffset, configuration.docsWithFieldLength, configuration.jumpTableEntryCount, @@ -215,18 +226,6 @@ public SparseOffHeapVectorValues( configuration.size); } - @Override - public SparseOffHeapVectorValues copy() throws IOException { - return new SparseOffHeapVectorValues( - configuration, - dataIn, - slice.clone(), - dimension, - byteSize, - flatVectorsScorer, - similarityFunction); - } - @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -234,7 +233,11 @@ public int ordToDoc(int ord) { @Override public DocIndexIterator iterator() { - return IndexedDISI.asDocIndexIterator(disi); + // we can only create a single iterator since creating a new IndexedDISI + // could throw an IOException + assert iterator == null; + iterator = IndexedDISI.asDocIndexIterator(disi); + return iterator; } @Override @@ -257,18 +260,19 @@ public int length() { @Override public VectorScorer scorer(byte[] query) throws IOException { - SparseOffHeapVectorValues copy = copy(); RandomVectorScorer scorer = - flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); + flatVectorsScorer.getRandomVectorScorer(similarityFunction, this, query); return new VectorScorer() { + IndexedDISI disi = createDISI(); + @Override public float score() throws IOException { - return scorer.score(copy.disi.index()); + return scorer.score(disi.index()); } @Override public DocIdSetIterator iterator() { - return copy.disi; + return disi; } }; } @@ -294,8 +298,8 @@ public int size() { } @Override - public byte[] vectorValue(int ord) 
throws IOException { - throw new UnsupportedOperationException(); + public Bytes values() { + return Bytes.EMPTY; } @Override @@ -303,11 +307,6 @@ public DocIndexIterator iterator() { return createDenseIterator(); } - @Override - public EmptyOffHeapVectorValues copy() throws IOException { - throw new UnsupportedOperationException(); - } - @Override public int ordToDoc(int ord) { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 5bc64817156..4ff9fdecf3b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -179,9 +179,6 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexedDISI disi; - // dataIn was used to init a new IndexedDIS for #randomAccess() - private final IndexInput dataIn; - private final OrdToDocDISIReaderConfiguration configuration; public SparseOffHeapVectorValues( OrdToDocDISIReaderConfiguration configuration, @@ -194,10 +191,8 @@ public SparseOffHeapVectorValues( throws IOException { super(dimension, configuration.size, slice, byteSize, flatVectorsScorer, similarityFunction); - this.configuration = configuration; final RandomAccessInput addressesData = dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength); - this.dataIn = dataIn; this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData); this.disi = new IndexedDISI( diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 59d8e872d7f..eca60c4ce15 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -363,9 +363,10 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + ByteVectorValues.Bytes values = byteVectorValues.values(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); + byte[] binaryValue = values.get(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); @@ -512,11 +513,6 @@ public RandomVectorScorer scorer(int ord) throws IOException { return supplier.scorer(ord); } - @Override - public RandomVectorScorerSupplier copy() throws IOException { - return supplier.copy(); - } - @Override public void close() throws IOException { onClose.close(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java index a4770f01f46..ec100c31dad 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java @@ -92,7 +92,8 @@ static RandomVectorScorer fromVectorSimilarity( float offsetCorrection, VectorSimilarityFunction sim, float constMultiplier, - QuantizedByteVectorValues values) { + QuantizedByteVectorValues values) + throws IOException { return switch (sim) { case EUCLIDEAN -> new Euclidean(values, constMultiplier, targetBytes); case COSINE, DOT_PRODUCT -> @@ 
-117,9 +118,11 @@ private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory( float offsetCorrection, float constMultiplier, QuantizedByteVectorValues values, - FloatToFloatFunction scoreAdjustmentFunction) { + FloatToFloatFunction scoreAdjustmentFunction) + throws IOException { + QuantizedByteVectorValues.QuantizedBytes vectors = values.values(); if (values.getScalarQuantizer().getBits() <= 4) { - if (values.getVectorByteLength() != values.dimension() && values.getSlice() != null) { + if (values.getVectorByteLength() != values.dimension() && vectors.getSlice() != null) { return new CompressedInt4DotProduct( values, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); } @@ -133,18 +136,19 @@ private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory( private static class Euclidean extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; private final byte[] targetBytes; - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues.QuantizedBytes vectors; - private Euclidean(QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) { + private Euclidean(QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) + throws IOException { super(values); - this.values = values; + vectors = values.values(); this.constMultiplier = constMultiplier; this.targetBytes = targetBytes; } @Override public float score(int node) throws IOException { - byte[] nodeVector = values.vectorValue(node); + byte[] nodeVector = vectors.get(node); int squareDistance = VectorUtil.squareDistance(nodeVector, targetBytes); float adjustedDistance = squareDistance * constMultiplier; return 1 / (1f + adjustedDistance); @@ -154,7 +158,7 @@ public float score(int node) throws IOException { /** Calculates dot product on quantized vectors, applying the appropriate corrections */ private static class DotProduct extends 
RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues.QuantizedBytes vectors; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; @@ -164,10 +168,11 @@ public DotProduct( float constMultiplier, byte[] targetBytes, float offsetCorrection, - FloatToFloatFunction scoreAdjustmentFunction) { + FloatToFloatFunction scoreAdjustmentFunction) + throws IOException { super(values); this.constMultiplier = constMultiplier; - this.values = values; + vectors = values.values(); this.targetBytes = targetBytes; this.offsetCorrection = offsetCorrection; this.scoreAdjustmentFunction = scoreAdjustmentFunction; @@ -175,8 +180,8 @@ public DotProduct( @Override public float score(int vectorOrdinal) throws IOException { - byte[] storedVector = values.vectorValue(vectorOrdinal); - float vectorOffset = values.getScoreCorrectionConstant(vectorOrdinal); + byte[] storedVector = vectors.get(vectorOrdinal); + float vectorOffset = vectors.getScoreCorrectionConstant(vectorOrdinal); int dotProduct = VectorUtil.dotProduct(storedVector, targetBytes); // For the current implementation of scalar quantization, all dotproducts should be >= 0; assert dotProduct >= 0; @@ -189,6 +194,7 @@ private static class CompressedInt4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues.QuantizedBytes vectors; private final byte[] compressedVector; private final byte[] targetBytes; private final float offsetCorrection; @@ -199,10 +205,12 @@ private CompressedInt4DotProduct( float constMultiplier, byte[] targetBytes, float offsetCorrection, - FloatToFloatFunction scoreAdjustmentFunction) { + FloatToFloatFunction scoreAdjustmentFunction) + throws IOException { super(values); 
this.constMultiplier = constMultiplier; this.values = values; + vectors = values.values(); this.compressedVector = new byte[values.getVectorByteLength()]; this.targetBytes = targetBytes; this.offsetCorrection = offsetCorrection; @@ -213,9 +221,9 @@ private CompressedInt4DotProduct( public float score(int vectorOrdinal) throws IOException { // get compressed vector, in Lucene99, vector values are stored and have a single value for // offset correction - values.getSlice().seek((long) vectorOrdinal * (values.getVectorByteLength() + Float.BYTES)); - values.getSlice().readBytes(compressedVector, 0, compressedVector.length); - float vectorOffset = values.getScoreCorrectionConstant(vectorOrdinal); + vectors.getSlice().seek((long) vectorOrdinal * (values.getVectorByteLength() + Float.BYTES)); + vectors.getSlice().readBytes(compressedVector, 0, compressedVector.length); + float vectorOffset = vectors.getScoreCorrectionConstant(vectorOrdinal); int dotProduct = VectorUtil.int4DotProductPacked(targetBytes, compressedVector); // For the current implementation of scalar quantization, all dotproducts should be >= 0; assert dotProduct >= 0; @@ -226,7 +234,7 @@ public float score(int vectorOrdinal) throws IOException { private static class Int4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues.QuantizedBytes vectors; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; @@ -236,10 +244,11 @@ public Int4DotProduct( float constMultiplier, byte[] targetBytes, float offsetCorrection, - FloatToFloatFunction scoreAdjustmentFunction) { + FloatToFloatFunction scoreAdjustmentFunction) + throws IOException { super(values); this.constMultiplier = constMultiplier; - this.values = values; + vectors = values.values(); this.targetBytes = targetBytes; this.offsetCorrection = 
offsetCorrection; this.scoreAdjustmentFunction = scoreAdjustmentFunction; @@ -247,8 +256,8 @@ public Int4DotProduct( @Override public float score(int vectorOrdinal) throws IOException { - byte[] storedVector = values.vectorValue(vectorOrdinal); - float vectorOffset = values.getScoreCorrectionConstant(vectorOrdinal); + byte[] storedVector = vectors.get(vectorOrdinal); + float vectorOffset = vectors.getScoreCorrectionConstant(vectorOrdinal); int dotProduct = VectorUtil.int4DotProduct(storedVector, targetBytes); // For the current implementation of scalar quantization, all dotproducts should be >= 0; assert dotProduct >= 0; @@ -267,33 +276,26 @@ private static final class ScalarQuantizedRandomVectorScorerSupplier private final VectorSimilarityFunction vectorSimilarityFunction; private final QuantizedByteVectorValues values; - private final QuantizedByteVectorValues values1; - private final QuantizedByteVectorValues values2; + private final QuantizedByteVectorValues.QuantizedBytes vectors; public ScalarQuantizedRandomVectorScorerSupplier( QuantizedByteVectorValues values, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { this.values = values; - this.values1 = values.copy(); - this.values2 = values.copy(); + this.vectors = values.values(); this.vectorSimilarityFunction = vectorSimilarityFunction; } @Override public RandomVectorScorer scorer(int ord) throws IOException { - byte[] vectorValue = values1.vectorValue(ord); - float offsetCorrection = values1.getScoreCorrectionConstant(ord); + byte[] vectorValue = vectors.get(ord); + float offsetCorrection = vectors.getScoreCorrectionConstant(ord); return fromVectorSimilarity( vectorValue, offsetCorrection, vectorSimilarityFunction, values.getScalarQuantizer().getConstantMultiplier(), - values2); - } - - @Override - public ScalarQuantizedRandomVectorScorerSupplier copy() throws IOException { - return new ScalarQuantizedRandomVectorScorerSupplier(values.copy(), vectorSimilarityFunction); + values); } 
@Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index df2384cf4a5..fcb28fefffb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -729,9 +729,10 @@ public static DocsWithFieldSet writeQuantizedVectorData( quantizedByteVectorValues.dimension(), bits) : null; KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); + QuantizedByteVectorValues.QuantizedBytes vectors = quantizedByteVectorValues.values(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = quantizedByteVectorValues.vectorValue(iter.index()); + byte[] binaryValue = vectors.get(iter.index()); assert binaryValue.length == quantizedByteVectorValues.dimension() : "dim=" + quantizedByteVectorValues.dimension() + " len=" + binaryValue.length; if (compressedVector != null) { @@ -740,8 +741,7 @@ public static DocsWithFieldSet writeQuantizedVectorData( } else { output.writeBytes(binaryValue, binaryValue.length); } - output.writeInt( - Float.floatToIntBits(quantizedByteVectorValues.getScoreCorrectionConstant(iter.index()))); + output.writeInt(Float.floatToIntBits(vectors.getScoreCorrectionConstant(iter.index()))); docsWithField.add(docV); } return docsWithField; @@ -889,12 +889,15 @@ public DocIndexIterator iterator() { static class QuantizedByteVectorValueSub extends DocIDMerger.Sub { private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues.QuantizedBytes vectors; private final KnnVectorValues.DocIndexIterator iterator; - QuantizedByteVectorValueSub(MergeState.DocMap docMap, QuantizedByteVectorValues values) { + QuantizedByteVectorValueSub(MergeState.DocMap docMap, 
QuantizedByteVectorValues values) + throws IOException { super(docMap); this.values = values; iterator = values.iterator(); + vectors = values.values(); assert iterator.docID() == -1; } @@ -975,8 +978,18 @@ private MergedQuantizedVectorValues( } @Override - public byte[] vectorValue(int ord) throws IOException { - return current.values.vectorValue(current.index()); + public QuantizedBytes values() throws IOException { + return new QuantizedBytes() { + @Override + public byte[] get(int ord) throws IOException { + return current.vectors.get(current.index()); + } + + @Override + public float getScoreCorrectionConstant(int ord) throws IOException { + return current.vectors.getScoreCorrectionConstant(current.index()); + } + }; } @Override @@ -994,11 +1007,6 @@ public int dimension() { return subs.get(0).values.dimension(); } - @Override - public float getScoreCorrectionConstant(int ord) throws IOException { - return current.values.getScoreCorrectionConstant(current.index()); - } - private class CompositeIterator extends DocIndexIterator { private int docId; private int ord; @@ -1046,10 +1054,6 @@ public long cost() { static class QuantizedFloatVectorValues extends QuantizedByteVectorValues { private final FloatVectorValues values; private final ScalarQuantizer quantizer; - private final byte[] quantizedVector; - private final FloatVectorValues.Floats floats; - private int lastOrd = -1; - private float offsetValue = 0f; private final VectorSimilarityFunction vectorSimilarityFunction; @@ -1060,21 +1064,7 @@ public QuantizedFloatVectorValues( throws IOException { this.values = values; this.quantizer = quantizer; - quantizedVector = new byte[values.dimension()]; this.vectorSimilarityFunction = vectorSimilarityFunction; - floats = values.values(); - } - - @Override - public float getScoreCorrectionConstant(int ord) { - if (ord != lastOrd) { - throw new IllegalStateException( - "attempt to retrieve score correction for different ord " - + ord - + " than the quantization was 
done for: " - + lastOrd); - } - return offsetValue; } @Override @@ -1088,12 +1078,38 @@ public int size() { } @Override - public byte[] vectorValue(int ord) throws IOException { - if (ord != lastOrd) { - offsetValue = quantize(ord); - lastOrd = ord; - } - return quantizedVector; + public QuantizedBytes values() throws IOException { + return new QuantizedBytes() { + FloatVectorValues.Floats vectors = values.values(); + byte[] quantizedVector = new byte[values.dimension()]; + float offsetValue = 0f; + int lastOrd = -1; + + @Override + public byte[] get(int ord) throws IOException { + if (ord != lastOrd) { + offsetValue = quantize(ord); + lastOrd = ord; + } + return quantizedVector; + } + + @Override + public float getScoreCorrectionConstant(int ord) { + if (ord != lastOrd) { + throw new IllegalStateException( + "attempt to retrieve score correction for different ord " + + ord + + " than the quantization was done for: " + + lastOrd); + } + return offsetValue; + } + + private float quantize(int ord) throws IOException { + return quantizer.quantize(vectors.get(ord), quantizedVector, vectorSimilarityFunction); + } + }; } @Override @@ -1101,10 +1117,6 @@ public VectorScorer scorer(float[] target) throws IOException { throw new UnsupportedOperationException(); } - private float quantize(int ord) throws IOException { - return quantizer.quantize(floats.get(ord), quantizedVector, vectorSimilarityFunction); - } - @Override public int ordToDoc(int ord) { return values.ordToDoc(ord); @@ -1135,11 +1147,6 @@ public RandomVectorScorer scorer(int ord) throws IOException { return supplier.scorer(ord); } - @Override - public RandomVectorScorerSupplier copy() throws IOException { - return supplier.copy(); - } - @Override public void close() throws IOException { onClose.close(); @@ -1169,9 +1176,21 @@ static final class OffsetCorrectedQuantizedByteVectorValues extends QuantizedByt } @Override - public float getScoreCorrectionConstant(int ord) throws IOException { - return 
scalarQuantizer.recalculateCorrectiveOffset( - in.vectorValue(ord), oldScalarQuantizer, vectorSimilarityFunction); + public QuantizedBytes values() throws IOException { + return new QuantizedBytes() { + Bytes vectors = in.values(); + + @Override + public byte[] get(int ord) throws IOException { + return vectors.get(ord); + } + + @Override + public float getScoreCorrectionConstant(int ord) throws IOException { + return scalarQuantizer.recalculateCorrectiveOffset( + vectors.get(ord), oldScalarQuantizer, vectorSimilarityFunction); + } + }; } @Override @@ -1184,11 +1203,6 @@ public int size() { return in.size(); } - @Override - public byte[] vectorValue(int ord) throws IOException { - return in.vectorValue(ord); - } - @Override public int ordToDoc(int ord) { return in.ordToDoc(ord); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java index 051c926a679..2de462c857c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java @@ -47,11 +47,7 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect protected final boolean compress; protected final IndexInput slice; - protected final byte[] binaryValue; - protected final ByteBuffer byteBuffer; protected final int byteSize; - protected int lastOrd = -1; - protected final float[] scoreCorrectionConstant = new float[1]; static void decompressBytes(byte[] compressed, int numBytes) { if (numBytes == compressed.length) { @@ -105,8 +101,6 @@ static void compressBytes(byte[] raw, byte[] compressed) { this.numBytes = dimension; } this.byteSize = this.numBytes + Float.BYTES; - byteBuffer = ByteBuffer.allocate(dimension); - binaryValue = byteBuffer.array(); this.similarityFunction = similarityFunction; 
this.vectorsScorer = vectorsScorer; } @@ -127,31 +121,42 @@ public int size() { } @Override - public byte[] vectorValue(int targetOrd) throws IOException { - if (lastOrd == targetOrd) { - return binaryValue; - } - slice.seek((long) targetOrd * byteSize); - slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), numBytes); - slice.readFloats(scoreCorrectionConstant, 0, 1); - decompressBytes(binaryValue, numBytes); - lastOrd = targetOrd; - return binaryValue; - } + public QuantizedBytes values() throws IOException { + return new QuantizedBytes() { + ByteBuffer byteBuffer = ByteBuffer.allocate(dimension); + byte[] binaryValue = byteBuffer.array(); + IndexInput input = slice.clone(); + float[] scoreCorrectionConstant = new float[1]; + int lastOrd = -1; + + @Override + public byte[] get(int targetOrd) throws IOException { + if (lastOrd == targetOrd) { + return binaryValue; + } + input.seek((long) targetOrd * byteSize); + input.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), numBytes); + input.readFloats(scoreCorrectionConstant, 0, 1); + decompressBytes(binaryValue, numBytes); + lastOrd = targetOrd; + return binaryValue; + } - @Override - public float getScoreCorrectionConstant(int targetOrd) throws IOException { - if (lastOrd == targetOrd) { - return scoreCorrectionConstant[0]; - } - slice.seek(((long) targetOrd * byteSize) + numBytes); - slice.readFloats(scoreCorrectionConstant, 0, 1); - return scoreCorrectionConstant[0]; - } + @Override + public float getScoreCorrectionConstant(int targetOrd) throws IOException { + if (lastOrd == targetOrd) { + return scoreCorrectionConstant[0]; + } + input.seek(((long) targetOrd * byteSize) + numBytes); + input.readFloats(scoreCorrectionConstant, 0, 1); + return scoreCorrectionConstant[0]; + } - @Override - public IndexInput getSlice() { - return slice; + @Override + public IndexInput getSlice() { + return input; + } + }; } @Override @@ -217,18 +222,6 @@ public DenseOffHeapVectorValues( super(dimension, size, 
scalarQuantizer, similarityFunction, vectorsScorer, compress, slice); } - @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues( - dimension, - size, - scalarQuantizer, - compress, - similarityFunction, - vectorsScorer, - slice.clone()); - } - @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -236,10 +229,9 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] target) throws IOException { - DenseOffHeapVectorValues copy = copy(); - DocIndexIterator iterator = copy.iterator(); + DocIndexIterator iterator = iterator(); RandomVectorScorer vectorScorer = - vectorsScorer.getRandomVectorScorer(similarityFunction, copy, target); + vectorsScorer.getRandomVectorScorer(similarityFunction, this, target); return new VectorScorer() { @Override public float score() throws IOException { @@ -262,9 +254,6 @@ public DocIndexIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapQuantizedByteVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexedDISI disi; - // dataIn was used to init a new IndexedDIS for #randomAccess() - private final IndexInput dataIn; - private final OrdToDocDISIReaderConfiguration configuration; public SparseOffHeapVectorValues( OrdToDocDISIReaderConfiguration configuration, @@ -278,8 +267,6 @@ public SparseOffHeapVectorValues( IndexInput slice) throws IOException { super(dimension, size, scalarQuantizer, similarityFunction, vectorsScorer, compress, slice); - this.configuration = configuration; - this.dataIn = dataIn; this.ordToDoc = configuration.getDirectMonotonicReader(dataIn); this.disi = configuration.getIndexedDISI(dataIn); } @@ -289,20 +276,6 @@ public DocIndexIterator iterator() { return IndexedDISI.asDocIndexIterator(disi); } - @Override - public SparseOffHeapVectorValues copy() throws IOException { - return new SparseOffHeapVectorValues( - configuration, - dimension, - size, - 
scalarQuantizer, - compress, - dataIn, - similarityFunction, - vectorsScorer, - slice.clone()); - } - @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -328,10 +301,9 @@ public int length() { @Override public VectorScorer scorer(float[] target) throws IOException { - SparseOffHeapVectorValues copy = copy(); - DocIndexIterator iterator = copy.iterator(); + DocIndexIterator iterator = iterator(); RandomVectorScorer vectorScorer = - vectorsScorer.getRandomVectorScorer(similarityFunction, copy, target); + vectorsScorer.getRandomVectorScorer(similarityFunction, this, target); return new VectorScorer() { @Override public float score() throws IOException { @@ -378,13 +350,8 @@ public DocIndexIterator iterator() { } @Override - public EmptyOffHeapVectorValues copy() { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] vectorValue(int targetOrd) { - throw new UnsupportedOperationException(); + public QuantizedBytes values() { + return QuantizedBytes.EMPTY; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java index 1231166f533..aa3381a34cb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java @@ -32,15 +32,27 @@ public abstract class ByteVectorValues extends KnnVectorValues { /** Sole constructor */ protected ByteVectorValues() {} - /** - * Return the vector value for the given vector ordinal which must be in [0, size() - 1], - * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. 
- * - * @return the vector value - */ - public abstract byte[] vectorValue(int ord) throws IOException; + /** A random access (lookup by ord) provider of the vector values */ + public abstract static class Bytes { + /** + * Return the vector value for the given vector ordinal which must be in [0, size() - 1], + * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. + * + * @return the vector value + */ + public abstract byte[] get(int ord) throws IOException; + + public static final Bytes EMPTY = + new Bytes() { + @Override + public byte[] get(int ord) { + throw new UnsupportedOperationException(); + } + }; + } - public abstract ByteVectorValues copy() throws IOException; + /** Returns a random access (lookup by ord) provider of the vector values */ + public abstract Bytes values() throws IOException; /** * Checks the Vector Encoding of a field @@ -98,13 +110,13 @@ public int dimension() { } @Override - public byte[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); - } - - @Override - public ByteVectorValues copy() { - return this; + public Bytes values() { + return new Bytes() { + @Override + public byte[] get(int targetOrd) { + return vectors.get(targetOrd); + } + }; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index eb6108629f9..f9db7143724 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2812,20 +2812,19 @@ private static void checkByteVectorValues( int count = 0; int everyNdoc = Math.max(values.size() / 64, 1); boolean supportsSearch = vectorsReaderSupportsSearch(codecReader, fieldInfo.name); + ByteVectorValues.Bytes vectors = values.values(); while (count < values.size()) { // search the first maxNumSearches vectors to exercise the graph if (supportsSearch && values.ordToDoc(count) % everyNdoc == 0) { 
KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); - codecReader - .getVectorReader() - .search(fieldInfo.name, values.vectorValue(count), collector, null); + codecReader.getVectorReader().search(fieldInfo.name, vectors.get(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" failed to search k nearest neighbors"); } } - int valueLength = values.vectorValue(count).length; + int valueLength = vectors.get(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index 6b9562fa5d5..868601ab83b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -444,6 +444,7 @@ public int dimension() { public Floats values() throws IOException { Floats dict = vectorValues.values(); return new Floats() { + @Override public float[] get(int ord) throws IOException { return dict.get(ord); } @@ -489,8 +490,15 @@ public int size() { } @Override - public byte[] vectorValue(int ord) throws IOException { - return vectorValues.vectorValue(ord); + public Bytes values() throws IOException { + return new Bytes() { + Bytes vectors = vectorValues.values(); + + @Override + public byte[] get(int ord) throws IOException { + return vectors.get(ord); + } + }; } @Override @@ -507,11 +515,6 @@ public DocIndexIterator iterator() { public VectorScorer scorer(byte[] target) throws IOException { return vectorValues.scorer(target); } - - @Override - public ByteVectorValues copy() { - throw new UnsupportedOperationException(); - } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java 
b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index 3e332b98de3..b02c060e43b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -32,6 +32,7 @@ public abstract class FloatVectorValues extends KnnVectorValues { /** Sole constructor */ protected FloatVectorValues() {} + /** A random access (lookup by ord) provider of the vector values */ public abstract static class Floats { /** * Return the vector value for the given vector ordinal which must be in [0, size() - 1], diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index 37cd21abf20..af42c7f8541 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -887,7 +887,7 @@ public int size() { public Floats values() { return new Floats() { int lastSubIndex = -1; - Floats subDict; + Floats subValues; @Override public float[] get(int ord) throws IOException { @@ -898,9 +898,9 @@ public float[] get(int ord) throws IOException { if (newSubIndex != lastSubIndex) { lastSubIndex = newSubIndex; assert subs[lastSubIndex].sub != null; - subDict = ((FloatVectorValues) subs[lastSubIndex].sub).values(); + subValues = ((FloatVectorValues) subs[lastSubIndex].sub).values(); } - return subDict.get(ord - subs[lastSubIndex].ordStart); + return subValues.get(ord - subs[lastSubIndex].ordStart); } }; } @@ -932,7 +932,6 @@ class MergedByteVectorValues extends ByteVectorValues { final DocValuesSub[] subs; final MergedDocIterator iter; final int[] starts; - int lastSubIndex; MergedByteVectorValues(int dimension, int size, List> subs) { this.dimension = dimension; @@ -964,24 +963,26 @@ public int size() { } @Override - public byte[] vectorValue(int ord) throws 
IOException { - assert ord >= 0 && ord < size; - // We need to implement fully random-access API here in order to support callers like - // SortingCodecReader that rely on it. We maintain lastSubIndex since we expect some - // repetition. - lastSubIndex = findSub(ord, lastSubIndex, starts); - return ((ByteVectorValues) subs[lastSubIndex].sub) - .vectorValue(ord - subs[lastSubIndex].ordStart); - } + public Bytes values() { + return new Bytes() { + int lastSubIndex = -1; + Bytes subValues; - @SuppressWarnings("unchecked") - @Override - public ByteVectorValues copy() throws IOException { - List> newSubs = new ArrayList<>(); - for (Object sub : subs) { - newSubs.add((DocValuesSub) sub); - } - return new MergedByteVectorValues(dimension, size, newSubs); + @Override + public byte[] get(int ord) throws IOException { + assert ord >= 0 && ord < size; + // We need to implement fully random-access API here in order to support callers like + // SortingCodecReader that rely on it. We maintain lastSubIndex since we expect some + // repetition. 
+ int newSubIndex = findSub(ord, lastSubIndex, starts); + if (newSubIndex != lastSubIndex) { + lastSubIndex = newSubIndex; + assert subs[lastSubIndex].sub != null; + subValues = ((ByteVectorValues) subs[lastSubIndex].sub).values(); + } + return subValues.get(ord - subs[lastSubIndex].ordStart); + } + }; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index d92f9ddea00..8d488b5b45a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -322,6 +322,7 @@ private static class SortingFloatVectorValues extends FloatVectorValues { public Floats values() throws IOException { Floats delegateDict = delegate.values(); return new Floats() { + @Override public float[] get(int ord) throws IOException { // ords are interpreted in the delegate's ord-space. return delegateDict.get(ord); @@ -356,8 +357,15 @@ private static class SortingByteVectorValues extends ByteVectorValues { } @Override - public byte[] vectorValue(int ord) throws IOException { - return delegate.vectorValue(ord); + public Bytes values() throws IOException { + return new Bytes() { + Bytes values = delegate.values(); + + @Override + public byte[] get(int ord) throws IOException { + return values.get(ord); + } + }; } @Override @@ -374,11 +382,6 @@ public int dimension() { public int size() { return iteratorSupplier.size(); } - - @Override - public ByteVectorValues copy() { - throw new UnsupportedOperationException(); - } } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java index 148963e7dc9..162f6013140 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/hnsw/CloseableRandomVectorScorerSupplier.java @@ -21,10 +21,7 @@ /** * A supplier that creates {@link RandomVectorScorer} from an ordinal. Caller should be sure to - * close after use - * - *

NOTE: the {@link #copy()} returned {@link RandomVectorScorerSupplier} is not necessarily - * closeable + * close after use. */ public interface CloseableRandomVectorScorerSupplier extends Closeable, RandomVectorScorerSupplier { int totalVectorCount(); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java index aebed564238..d20b6de53bc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java @@ -60,7 +60,7 @@ public HnswConcurrentMergeBuilder( for (int i = 0; i < numWorker; i++) { workers[i] = new ConcurrentMergeWorker( - scorerSupplier.copy(), + scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed, diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorerSupplier.java index f8436f061d6..d950ef5314e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorerSupplier.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorerSupplier.java @@ -29,10 +29,4 @@ public interface RandomVectorScorerSupplier { * @return a new {@link RandomVectorScorer} */ RandomVectorScorer scorer(int ord) throws IOException; - - /** - * Make a copy of the supplier, which will copy the underlying vectorValues so the copy is safe to - * be used in other threads. 
- */ - RandomVectorScorerSupplier copy() throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java index b90ab8276dd..88b071bf2ce 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java @@ -28,14 +28,12 @@ * * @lucene.experimental */ -public abstract class QuantizedByteVectorValues extends ByteVectorValues implements HasIndexSlice { +public abstract class QuantizedByteVectorValues extends ByteVectorValues { public ScalarQuantizer getScalarQuantizer() { throw new UnsupportedOperationException(); } - public abstract float getScoreCorrectionConstant(int ord) throws IOException; - /** * Return a {@link VectorScorer} for the given query vector. * @@ -46,13 +44,35 @@ public VectorScorer scorer(float[] query) throws IOException { throw new UnsupportedOperationException(); } + /** Returns a random access (lookup by ord) provider of the quantized vector values */ @Override - public QuantizedByteVectorValues copy() throws IOException { - return this; - } + public abstract QuantizedBytes values() throws IOException; - @Override - public IndexInput getSlice() { - return null; + /** A Bytes that also provides quantization info */ + public abstract static class QuantizedBytes extends Bytes implements HasIndexSlice { + + /** + * Returns a constant that can be used to account for differences in quantization in order to + * make scores computed across differently-quantized vectors comparable. 
+ */ + public abstract float getScoreCorrectionConstant(int ord) throws IOException; + + @Override + public IndexInput getSlice() { + return null; + } + + public static final QuantizedBytes EMPTY = + new QuantizedBytes() { + @Override + public byte[] get(int ord) { + throw new UnsupportedOperationException(); + } + + @Override + public float getScoreCorrectionConstant(int ord) { + throw new UnsupportedOperationException(); + } + }; } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index 02c71561122..ef904867349 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -77,7 +77,7 @@ final void checkOrdinal(int ord) { } } - final MemorySegment getFirstSegment(int ord) throws IOException { + final MemorySegment getFirstSegment(MemorySegmentAccessInput input, int ord) throws IOException { long byteOffset = (long) ord * vectorByteSize; MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); if (seg == null) { @@ -90,7 +90,7 @@ final MemorySegment getFirstSegment(int ord) throws IOException { return seg; } - final MemorySegment getSecondSegment(int ord) throws IOException { + final MemorySegment getSecondSegment(MemorySegmentAccessInput input, int ord) throws IOException { long byteOffset = (long) ord * vectorByteSize; MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); if (seg == null) { @@ -112,20 +112,16 @@ static final class CosineSupplier extends Lucene99MemorySegmentByteVectorScorerS @Override public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); + MemorySegmentAccessInput slice = input.clone(); return new 
RandomVectorScorer.AbstractRandomVectorScorer(values) { @Override public float score(int node) throws IOException { checkOrdinal(node); - float raw = PanamaVectorUtilSupport.cosine(getFirstSegment(ord), getSecondSegment(node)); + float raw = PanamaVectorUtilSupport.cosine(getFirstSegment(slice, ord), getSecondSegment(slice, node)); return (1 + raw) / 2; } }; } - - @Override - public CosineSupplier copy() throws IOException { - return new CosineSupplier(input.clone(), values); - } } static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { @@ -138,21 +134,17 @@ static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorSco public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + MemorySegmentAccessInput slice = input.clone(); @Override public float score(int node) throws IOException { checkOrdinal(node); // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len float raw = - PanamaVectorUtilSupport.dotProduct(getFirstSegment(ord), getSecondSegment(node)); + PanamaVectorUtilSupport.dotProduct(getFirstSegment(slice, ord), getSecondSegment(slice, node)); return 0.5f + raw / (float) (values.dimension() * (1 << 15)); } }; } - - @Override - public DotProductSupplier copy() throws IOException { - return new DotProductSupplier(input.clone(), values); - } } static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { @@ -164,21 +156,17 @@ static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScor @Override public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); + MemorySegmentAccessInput slice = input.clone(); return new RandomVectorScorer.AbstractRandomVectorScorer(values) { @Override public float score(int node) throws IOException { checkOrdinal(node); float raw = - PanamaVectorUtilSupport.squareDistance(getFirstSegment(ord), getSecondSegment(node)); + 
PanamaVectorUtilSupport.squareDistance(getFirstSegment(slice, ord), getSecondSegment(slice, node)); return 1 / (1f + raw); } }; } - - @Override - public EuclideanSupplier copy() throws IOException { - return new EuclideanSupplier(input.clone(), values); - } } static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { @@ -191,11 +179,12 @@ static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVect public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + MemorySegmentAccessInput slice = input.clone(); @Override public float score(int node) throws IOException { checkOrdinal(node); float raw = - PanamaVectorUtilSupport.dotProduct(getFirstSegment(ord), getSecondSegment(node)); + PanamaVectorUtilSupport.dotProduct(getFirstSegment(slice, ord), getSecondSegment(slice, node)); if (raw < 0) { return 1 / (1 + -1 * raw); } @@ -203,10 +192,5 @@ public float score(int node) throws IOException { } }; } - - @Override - public MaxInnerProductSupplier copy() throws IOException { - return new MaxInnerProductSupplier(input.clone(), values); - } } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index ed70b2df002..cee219d8e42 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -311,9 +311,11 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { assertNotNull(hnswReader.getQuantizationState("f")); QuantizedByteVectorValues quantizedByteVectorValues = hnswReader.getQuantizedVectorValues("f"); + QuantizedByteVectorValues.QuantizedBytes byteVectors = + quantizedByteVectorValues.values(); for (int ord = 0; ord < 
quantizedByteVectorValues.size(); ord++) { - byte[] vector = quantizedByteVectorValues.vectorValue(ord); - float offset = quantizedByteVectorValues.getScoreCorrectionConstant(ord); + byte[] vector = byteVectors.get(ord); + float offset = byteVectors.getScoreCorrectionConstant(ord); for (int i = 0; i < dim; i++) { assertEquals(vector[i], expectedVectors[ord][i]); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java index 3b758de6ce6..d4594673a58 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java @@ -118,23 +118,23 @@ public int size() { } @Override - public byte[] vectorValue(int ord) { - return new byte[32]; - } + public QuantizedBytes values() { + return new QuantizedBytes() { + @Override + public byte[] get(int ord) { + return new byte[32]; + } - @Override - public float getScoreCorrectionConstant(int ord) { - return -50; - } + @Override + public float getScoreCorrectionConstant(int ord) { + return -50; + } - @Override - public QuantizedByteVectorValues copy() throws IOException { - return this; - } - - @Override - public IndexInput getSlice() { - return in; + @Override + public IndexInput getSlice() { + return in; + } + }; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java index ccba0975d73..5a3074488a7 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -173,11 +173,13 @@ public void 
testQuantizedVectorsWriteAndRead() throws Exception { assertNotNull(quantizedReader.getQuantizationState("f")); QuantizedByteVectorValues quantizedByteVectorValues = quantizedReader.getQuantizedVectorValues("f"); + QuantizedByteVectorValues.QuantizedBytes byteVectors = + quantizedByteVectorValues.values(); int docId = -1; KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); for (docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { - byte[] vector = quantizedByteVectorValues.vectorValue(iter.index()); - float offset = quantizedByteVectorValues.getScoreCorrectionConstant(iter.index()); + byte[] vector = byteVectors.get(iter.index()); + float offset = byteVectors.getScoreCorrectionConstant(iter.index()); for (int i = 0; i < dim; i++) { assertEquals(vector[i], expectedVectors[docId][i]); } diff --git a/lucene/core/src/test/org/apache/lucene/document/TestField.java b/lucene/core/src/test/org/apache/lucene/document/TestField.java index e13d8cd6897..4d0be248614 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java @@ -714,13 +714,14 @@ public void testKnnVectorField() throws Exception { w.addDocument(doc); try (IndexReader r = DirectoryReader.open(w)) { ByteVectorValues binary = r.leaves().get(0).reader().getByteVectorValues("binary"); + ByteVectorValues.Bytes vectors = binary.values(); assertEquals(1, binary.size()); KnnVectorValues.DocIndexIterator iterator = binary.iterator(); assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); - assertNotNull(binary.vectorValue(0)); - assertArrayEquals(b, binary.vectorValue(0)); + assertNotNull(vectors.get(0)); + assertArrayEquals(b, vectors.get(0)); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); - expectThrows(IOException.class, () -> binary.vectorValue(1)); + expectThrows(IOException.class, () -> vectors.get(1)); FloatVectorValues floatValues = 
r.leaves().get(0).reader().getFloatVectorValues("float"); assertEquals(1, floatValues.size()); diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index bc3b6813a5b..086810001d7 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -257,8 +257,8 @@ public void testCopiesAcrossThreads() throws Exception { var scorer = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); var tasks = List.>>of( - new AssertingScoreCallable(scorer.copy().scorer(0), 1, expectedScore1), - new AssertingScoreCallable(scorer.copy().scorer(2), 3, expectedScore2)); + new AssertingScoreCallable(scorer.scorer(0), 1, expectedScore1), + new AssertingScoreCallable(scorer.scorer(2), 3, expectedScore2)); var executor = Executors.newFixedThreadPool(2, new NamedThreadFactory("copiesThreads")); var results = executor.invokeAll(tasks); executor.shutdown(); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java index 9234e70b802..8d66075b0c0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -85,7 +85,6 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.HnswGraph.NodesIterator; -import org.junit.Ignore; /** Tests HNSW KNN graphs */ abstract class HnswGraphTestCase extends LuceneTestCase { @@ -122,8 +121,7 @@ protected RandomVectorScorerSupplier buildScorerSupplier(KnnVectorValues vectors protected RandomVectorScorer buildScorer(KnnVectorValues vectors, T query) throws IOException { return switch (getVectorEncoding()) { case BYTE -> - 
flatVectorScorer.getRandomVectorScorer( - similarityFunction, ((ByteVectorValues) vectors).copy(), (byte[]) query); + flatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, (byte[]) query); case FLOAT32 -> flatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, (float[]) query); }; @@ -185,7 +183,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { doc.add( knnVectorField( "field", - (T) ((ByteVectorValues) vectors).vectorValue(ord), + (T) ((ByteVectorValues) vectors).values().get(ord), similarityFunction)); } case FLOAT32 -> { @@ -223,7 +221,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { private T vectorValue(KnnVectorValues vectors, int ord) throws IOException { switch (vectors.getEncoding()) { case BYTE -> { - return (T) ((ByteVectorValues) vectors).vectorValue(ord); + return (T) ((ByteVectorValues) vectors).values().get(ord); } case FLOAT32 -> { return (T) ((FloatVectorValues) vectors).values().get(ord); @@ -236,29 +234,6 @@ interface Vectors { T get(int ord) throws IOException; } - // we used to have a generically-typed vector API, now this persists only in tests - @SuppressWarnings("unchecked") - private static Vectors vectors(KnnVectorValues vectors) throws IOException { - return switch (vectors.getEncoding()) { - case FLOAT32 -> - new Vectors() { - FloatVectorValues.Floats dict = ((FloatVectorValues) vectors).values(); - - @Override - public T get(int ord) throws IOException { - return (T) dict.get(ord); - } - }; - case BYTE -> - new Vectors() { - @Override - public T get(int ord) throws IOException { - return (T) ((ByteVectorValues) vectors).vectorValue(ord); - } - }; - }; - } - // test writing out and reading in a graph gives the expected graph public void testReadWrite() throws IOException { int dim = random().nextInt(100) + 1; @@ -267,7 +242,6 @@ public void testReadWrite() throws IOException { int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); KnnVectorValues 
vectors = vectorValues(nDoc, dim); - Vectors v1 = vectors(vectors), v2 = vectors(vectors), v3 = vectors(vectors); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed); HnswGraph hnsw = builder.build(vectors.size()); @@ -1183,20 +1157,11 @@ public VectorScorer scorer(float[] target) { /** Returns vectors evenly distributed around the upper unit semicircle. */ static class CircularByteVectorValues extends ByteVectorValues { private final int size; - private final float[] value; - private final byte[] bValue; int doc = -1; CircularByteVectorValues(int size) { this.size = size; - value = new float[2]; - bValue = new byte[2]; - } - - @Override - public CircularByteVectorValues copy() { - return new CircularByteVectorValues(size); } @Override @@ -1209,34 +1174,21 @@ public int size() { return size; } - public byte[] vectorValue() { - return vectorValue(doc); - } - - public int docID() { - return doc; - } - - public int nextDoc() { - return advance(doc + 1); - } - - public int advance(int target) { - if (target >= 0 && target < size) { - doc = target; - } else { - doc = NO_MORE_DOCS; - } - return doc; - } - @Override - public byte[] vectorValue(int ord) { - unitVector2d(ord / (double) size, value); - for (int i = 0; i < value.length; i++) { - bValue[i] = (byte) (value[i] * 127); - } - return bValue; + public Bytes values() { + return new Bytes() { + byte[] bValue = new byte[2]; + float[] value = new float[2]; + + @Override + public byte[] get(int ord) { + unitVector2d(ord / (double) size, value); + for (int i = 0; i < value.length; i++) { + bValue[i] = (byte) (value[i] * 127); + } + return bValue; + } + }; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java index 4ab86c70781..719d7df76b4 100644 --- 
a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java @@ -19,16 +19,12 @@ import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; class MockByteVectorValues extends ByteVectorValues { private final int dimension; - private final byte[][] denseValues; protected final byte[][] values; - private final int numVectors; private final BytesRef binaryValue; - private final byte[] scratch; static MockByteVectorValues fromValues(byte[][] values) { byte[] firstNonNull = null; @@ -37,26 +33,15 @@ static MockByteVectorValues fromValues(byte[][] values) { firstNonNull = values[j++]; } int dimension = firstNonNull.length; - int maxDoc = values.length; - byte[][] denseValues = new byte[maxDoc][]; - int count = 0; - for (int i = 0; i < maxDoc; i++) { - if (values[i] != null) { - denseValues[count++] = values[i]; - } - } - return new MockByteVectorValues(values, dimension, denseValues, count); + return new MockByteVectorValues(values, dimension); } - MockByteVectorValues(byte[][] values, int dimension, byte[][] denseValues, int numVectors) { + MockByteVectorValues(byte[][] values, int dimension) { this.dimension = dimension; this.values = values; - this.denseValues = denseValues; - this.numVectors = numVectors; // used by tests that build a graph from bytes rather than floats binaryValue = new BytesRef(dimension); binaryValue.length = dimension; - scratch = new byte[dimension]; } @Override @@ -70,23 +55,25 @@ public int dimension() { } @Override - public MockByteVectorValues copy() { - return new MockByteVectorValues( - ArrayUtil.copyArray(values), dimension, ArrayUtil.copyArray(denseValues), numVectors); - } + public Bytes values() { + return new Bytes() { + byte[] scratch = new byte[dimension]; - @Override - public byte[] vectorValue(int ord) { - if 
(LuceneTestCase.random().nextBoolean()) { - return values[ord]; - } else { - // Sometimes use the same scratch array repeatedly, mimicing what the codec will do. - // This should help us catch cases of aliasing where the same ByteVectorValues source is used - // twice in a - // single computation. - System.arraycopy(values[ord], 0, scratch, 0, dimension); - return scratch; - } + @Override + public byte[] get(int ord) { + if (LuceneTestCase.random().nextBoolean()) { + return values[ord]; + } else { + // Sometimes use the same scratch array repeatedly, mimicking what the codec will do. + // This should help us catch cases of aliasing where the same ByteVectorValues source is + // used + // twice in a + // single computation. + System.arraycopy(values[ord], 0, scratch, 0, dimension); + return scratch; + } + } + }; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java index 2ff2d50a223..a25ed570de3 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java @@ -22,9 +22,7 @@ class MockVectorValues extends FloatVectorValues { private final int dimension; - private final float[][] denseValues; protected final float[][] values; - private final int numVectors; static MockVectorValues fromValues(float[][] values) { float[] firstNonNull = null; @@ -33,22 +31,12 @@ static MockVectorValues fromValues(float[][] values) { firstNonNull = values[j++]; } int dimension = firstNonNull.length; - int maxDoc = values.length; - float[][] denseValues = new float[maxDoc][]; - int count = 0; - for (int i = 0; i < maxDoc; i++) { - if (values[i] != null) { - denseValues[count++] = values[i]; - } - } - return new MockVectorValues(values, dimension, denseValues, count); + return new MockVectorValues(values, dimension); } - MockVectorValues(float[][] values, int dimension, float[][]
denseValues, int numVectors) { + MockVectorValues(float[][] values, int dimension) { this.dimension = dimension; this.values = values; - this.denseValues = denseValues; - this.numVectors = numVectors; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java index f0e6745211c..91611fbe775 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java @@ -110,10 +110,11 @@ MockByteVectorValues vectorValues( @Override MockByteVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { ByteVectorValues vectorValues = reader.getByteVectorValues(fieldName); + ByteVectorValues.Bytes byteVectors = vectorValues.values(); byte[][] vectors = new byte[reader.maxDoc()][]; for (int i = 0; i < vectorValues.size(); i++) { vectors[vectorValues.ordToDoc(i)] = - ArrayUtil.copyOfSubArray(vectorValues.vectorValue(i), 0, vectorValues.dimension()); + ArrayUtil.copyOfSubArray(byteVectors.get(i), 0, vectorValues.dimension()); } return MockByteVectorValues.fromValues(vectors); } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 0b5b5b665b4..6b5feb71dca 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -2364,12 +2364,17 @@ public int size() { } @Override - public byte[] vectorValue(int ord) { - if (ord == 0) { - return info.byteVectorValues[0]; - } else { - return null; - } + public Bytes values() { + return new Bytes() { + @Override + public byte[] get(int ord) { + if (ord == 0) { + return info.byteVectorValues[0]; + } else { + return null; + } + } + }; } @Override @@ -2378,7 +2383,7 @@ public DocIndexIterator 
iterator() { } @Override - public VectorScorer scorer(byte[] query) { + public VectorScorer scorer(byte[] query) throws IOException { if (query.length != info.fieldInfo.getVectorDimension()) { throw new IllegalArgumentException( "query vector dimension " @@ -2387,14 +2392,13 @@ public VectorScorer scorer(byte[] query) { + info.fieldInfo.getVectorDimension()); } MemoryByteVectorValues vectorValues = new MemoryByteVectorValues(info); + ByteVectorValues.Bytes vectors = vectorValues.values(); DocIndexIterator iterator = vectorValues.iterator(); return new VectorScorer() { @Override - public float score() { + public float score() throws IOException { assert iterator.docID() == 0; - return info.fieldInfo - .getVectorSimilarityFunction() - .compare(vectorValues.vectorValue(0), query); + return info.fieldInfo.getVectorSimilarityFunction().compare(vectors.get(0), query); } @Override @@ -2403,10 +2407,5 @@ public DocIdSetIterator iterator() { } }; } - - @Override - public MemoryByteVectorValues copy() { - return this; - } } } diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java index c16ed357573..4bd9a0bf87c 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java @@ -890,7 +890,7 @@ private static void assertByteVectorValue(MemoryIndex mi, String fieldName, byte assertNotNull(bvv); KnnVectorValues.DocIndexIterator iterator = bvv.iterator(); assertEquals(0, iterator.nextDoc()); - assertArrayEquals(expected, bvv.vectorValue(0)); + assertArrayEquals(expected, bvv.values().get(0)); assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java 
b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java index c95bf632a73..595df5f11fb 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java @@ -65,11 +65,12 @@ protected DocIdSetIterator getVectorIterator() { return new VectorFieldFunction(this) { KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + ByteVectorValues.Bytes vectors = vectorValues.values(); @Override public byte[] byteVectorVal(int doc) throws IOException { if (exists(doc)) { - return vectorValues.vectorValue(iterator.index()); + return vectors.get(iterator.index()); } else { return null; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index d22e908e8f3..e33b88ff583 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -882,11 +882,12 @@ public void testSparseVectors() throws Exception { for (LeafReaderContext ctx : r.leaves()) { ByteVectorValues byteVectorValues = ctx.reader().getByteVectorValues(fieldName); if (byteVectorValues != null) { + ByteVectorValues.Bytes vectors = byteVectorValues.values(); docCount += byteVectorValues.size(); KnnVectorValues.DocIndexIterator iterator = byteVectorValues.iterator(); while (true) { if (!(iterator.nextDoc() != NO_MORE_DOCS)) break; - checksum += byteVectorValues.vectorValue(iterator.index())[0]; + checksum += vectors.get(iterator.index())[0]; } } } @@ -897,10 +898,10 @@ public void testSparseVectors() throws Exception { if (vectorValues != null) { docCount += vectorValues.size(); KnnVectorValues.DocIndexIterator iterator 
= vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.values(); while (true) { if (!(iterator.nextDoc() != NO_MORE_DOCS)) break; - checksum += dict.get(iterator.index())[0]; + checksum += vectors.get(iterator.index())[0]; } } } @@ -1027,10 +1028,14 @@ public void testByteVectorScorerIteration() throws Exception { assertTrue(score >= 0f); assertEquals(iterator.docID(), valuesIterator.docID()); } + System.out.println("values=" + vectorValues); + System.out.println("scorer=" + scorer); // verify that a new scorer can be obtained after iteration VectorScorer newScorer = vectorValues.scorer(vectorToScore); assertNotNull(newScorer); assertNotSame(scorer, newScorer); + System.out.println("first iterator=" + iterator); + System.out.println("new iterator=" + newScorer.iterator()); assertNotSame(iterator, newScorer.iterator()); } } @@ -1132,16 +1137,16 @@ public void testIndexedValueNotAliased() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); assertEquals(3, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.values(); iterator.nextDoc(); assertEquals(0, iterator.index()); - assertEquals(1, dict.get(0)[0], 0); + assertEquals(1, vectors.get(0)[0], 0); iterator.nextDoc(); assertEquals(1, iterator.index()); - assertEquals(1, dict.get(1)[0], 0); + assertEquals(1, vectors.get(1)[0], 0); iterator.nextDoc(); assertEquals(2, iterator.index()); - assertEquals(2, dict.get(2)[0], 0); + assertEquals(2, vectors.get(2)[0], 0); } } } @@ -1165,13 +1170,13 @@ public void testSortedIndex() throws Exception { assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + 
FloatVectorValues.Floats vectors = vectorValues.values(); assertEquals("1", storedFields.document(iterator.nextDoc()).get("id")); - assertEquals(-1f, dict.get(0)[0], 0); + assertEquals(-1f, vectors.get(0)[0], 0); assertEquals("2", storedFields.document(iterator.nextDoc()).get("id")); - assertEquals(1, dict.get(1)[0], 0); + assertEquals(1, vectors.get(1)[0], 0); assertEquals("4", storedFields.document(iterator.nextDoc()).get("id")); - assertEquals(0, dict.get(2)[0], 0); + assertEquals(0, vectors.get(2)[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } @@ -1193,14 +1198,15 @@ public void testSortedIndexBytes() throws Exception { StoredFields storedFields = leaf.storedFields(); ByteVectorValues vectorValues = leaf.getByteVectorValues(fieldName); + ByteVectorValues.Bytes vectors = vectorValues.values(); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); assertEquals("1", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); - assertEquals(-1, vectorValues.vectorValue(0)[0], 0); + assertEquals(-1, vectors.get(0)[0], 0); assertEquals("2", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); - assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals(1, vectors.get(1)[0], 0); assertEquals("4", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); - assertEquals(0, vectorValues.vectorValue(2)[0], 0); + assertEquals(0, vectors.get(2)[0], 0); assertEquals(NO_MORE_DOCS, vectorValues.iterator().nextDoc()); } } @@ -1232,22 +1238,22 @@ public void testIndexMultipleKnnVectorFields() throws Exception { assertEquals(2, vectorValues.dimension()); assertEquals(2, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.values(); iterator.nextDoc(); - assertEquals(1f, dict.get(0)[0], 0); + assertEquals(1f, vectors.get(0)[0], 0); iterator.nextDoc(); - 
assertEquals(2f, dict.get(1)[0], 0); + assertEquals(2f, vectors.get(1)[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); FloatVectorValues vectorValues2 = leaf.getFloatVectorValues("field2"); KnnVectorValues.DocIndexIterator it2 = vectorValues2.iterator(); - FloatVectorValues.Floats dict2 = vectorValues2.values(); + FloatVectorValues.Floats vectors2 = vectorValues2.values(); assertEquals(4, vectorValues2.dimension()); assertEquals(2, vectorValues2.size()); it2.nextDoc(); - assertEquals(2f, dict2.get(0)[1], 0); + assertEquals(2f, vectors2.get(0)[1], 0); it2.nextDoc(); - assertEquals(2f, dict2.get(1)[1], 0); + assertEquals(2f, vectors2.get(1)[1], 0); assertEquals(NO_MORE_DOCS, it2.nextDoc()); FloatVectorValues vectorValues3 = leaf.getFloatVectorValues("field3"); @@ -1321,10 +1327,10 @@ public void testRandom() throws Exception { StoredFields storedFields = ctx.reader().storedFields(); int docId; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.values(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; - float[] v = dict.get(iterator.index()); + float[] v = vectors.get(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1400,13 +1406,14 @@ public void testRandomBytes() throws Exception { if (vectorValues == null) { continue; } + ByteVectorValues.Bytes vectors = vectorValues.values(); totalSize += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); int docId; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; - byte[] v = vectorValues.vectorValue(iterator.index()); + byte[] v = vectors.get(iterator.index()); assertEquals(dimension, v.length); String idString = 
storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1529,10 +1536,10 @@ public void testRandomWithUpdatesAndGraph() throws Exception { int docId; int numLiveDocsWithVectors = 0; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.values(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; - float[] v = dict.get(iterator.index()); + float[] v = vectors.get(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1819,9 +1826,10 @@ public void testVectorValuesReportCorrectDocs() throws Exception { docCount += byteVectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + ByteVectorValues.Bytes vectors = byteVectorValues.values(); for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { int ord = iter.index(); - checksum += byteVectorValues.vectorValue(ord)[0]; + checksum += vectors.get(ord)[0]; Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } @@ -1837,13 +1845,13 @@ public void testVectorValuesReportCorrectDocs() throws Exception { for (LeafReaderContext ctx : r.leaves()) { FloatVectorValues vectorValues = ctx.reader().getFloatVectorValues("knn_vector"); if (vectorValues != null) { - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.values(); docCount += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { int ord = iter.index(); - checksum += dict.get(ord)[0]; + checksum += 
vectors.get(ord)[0]; Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } @@ -1892,12 +1900,13 @@ public void testMismatchedFields() throws Exception { LeafReader leafReader = getOnlyLeafReader(reader); ByteVectorValues byteVectors = leafReader.getByteVectorValues("byte"); + ByteVectorValues.Bytes vectors = byteVectors.values(); assertNotNull(byteVectors); KnnVectorValues.DocIndexIterator iter = byteVectors.iterator(); assertEquals(0, iter.nextDoc()); - assertArrayEquals(new byte[] {42}, byteVectors.vectorValue(0)); + assertArrayEquals(new byte[] {42}, vectors.get(0)); assertEquals(1, iter.nextDoc()); - assertArrayEquals(new byte[] {42}, byteVectors.vectorValue(1)); + assertArrayEquals(new byte[] {42}, vectors.get(1)); assertEquals(DocIdSetIterator.NO_MORE_DOCS, iter.nextDoc()); FloatVectorValues floatVectors = leafReader.getFloatVectorValues("float"); From debac32144761312dd17f20f7f185174bcd7393f Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Sun, 6 Oct 2024 07:51:31 -0700 Subject: [PATCH 03/25] make sure KnnVectorValues.iterator() always returns a new value --- .../lucene92/OffHeapFloatVectorValues.java | 33 ++++++++------- .../lucene94/OffHeapByteVectorValues.java | 18 ++++---- .../lucene94/OffHeapFloatVectorValues.java | 33 ++++++++------- .../codecs/BufferingKnnVectorsWriter.java | 16 +++---- .../lucene/codecs/KnnVectorsWriter.java | 8 ++-- .../lucene95/OffHeapByteVectorValues.java | 25 ++++------- .../lucene95/OffHeapFloatVectorValues.java | 33 ++++++++------- .../Lucene99ScalarQuantizedVectorsReader.java | 2 +- .../Lucene99ScalarQuantizedVectorsWriter.java | 6 +-- .../lucene/index/ExitableDirectoryReader.java | 4 +- .../apache/lucene/index/KnnVectorValues.java | 2 +- .../SlowCompositeCodecReaderWrapper.java | 42 +++++++++---------- .../org/apache/lucene/util/IOBiFunction.java | 39 +++++++++++++++++ ...MemorySegmentByteVectorScorerSupplier.java | 15 +++++-- 
.../index/BaseKnnVectorsFormatTestCase.java | 13 ++++-- 15 files changed, 172 insertions(+), 117 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/IOBiFunction.java diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 16e0a89b0eb..6c7944c310d 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -136,7 +136,8 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; - private final IndexedDISI disi; + private final IndexInput dataIn; + private final Lucene92HnswVectorsReader.FieldEntry fieldEntry; public SparseOffHeapVectorValues( Lucene92HnswVectorsReader.FieldEntry fieldEntry, @@ -149,19 +150,23 @@ public SparseOffHeapVectorValues( final RandomAccessInput addressesData = dataIn.randomAccessSlice(fieldEntry.addressesOffset(), fieldEntry.addressesLength()); this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta(), addressesData); - this.disi = - new IndexedDISI( - dataIn, - fieldEntry.docsWithFieldOffset(), - fieldEntry.docsWithFieldLength(), - fieldEntry.jumpTableEntryCount(), - fieldEntry.denseRankPower(), - fieldEntry.size()); + this.dataIn = dataIn; + this.fieldEntry = fieldEntry; + } + + private IndexedDISI createDISI() throws IOException { + return new IndexedDISI( + dataIn.clone(), + fieldEntry.docsWithFieldOffset(), + fieldEntry.docsWithFieldLength(), + fieldEntry.jumpTableEntryCount(), + fieldEntry.denseRankPower(), + fieldEntry.size()); } @Override - public DocIndexIterator iterator() { - return IndexedDISI.asDocIndexIterator(disi); + public DocIndexIterator 
iterator() throws IOException { + return IndexedDISI.asDocIndexIterator(createDISI()); } @Override @@ -190,16 +195,16 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { FloatVectorValues.Floats values = values(); - DocIndexIterator iterator = iterator(); + IndexedDISI disi = createDISI(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(values.get(iterator.index()), query); + return vectorSimilarityFunction.compare(values.get(disi.index()), query); } @Override public DocIdSetIterator iterator() { - return iterator; + return disi; } }; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java index afb890f4863..acc39e48a22 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java @@ -153,7 +153,6 @@ private static class SparseOffHeapVectorValues extends OffHeapByteVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexInput dataIn; private final Lucene94HnswVectorsReader.FieldEntry fieldEntry; - private final IndexedDISI disi; public SparseOffHeapVectorValues( Lucene94HnswVectorsReader.FieldEntry fieldEntry, @@ -169,17 +168,16 @@ public SparseOffHeapVectorValues( this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta(), addressesData); this.fieldEntry = fieldEntry; this.dataIn = dataIn; - this.disi = createDISI(); } IndexedDISI createDISI() throws IOException { return new IndexedDISI( - dataIn.clone(), - fieldEntry.docsWithFieldOffset(), - fieldEntry.docsWithFieldLength(), - fieldEntry.jumpTableEntryCount(), - fieldEntry.denseRankPower(), - fieldEntry.size()); + dataIn.clone(), + 
fieldEntry.docsWithFieldOffset(), + fieldEntry.docsWithFieldLength(), + fieldEntry.jumpTableEntryCount(), + fieldEntry.denseRankPower(), + fieldEntry.size()); } @Override @@ -188,8 +186,8 @@ public int ordToDoc(int ord) { } @Override - public DocIndexIterator iterator() { - return fromDISI(disi); + public DocIndexIterator iterator() throws IOException { + return IndexedDISI.asDocIndexIterator(createDISI()); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index 96d010071f9..5b67552e78a 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -148,7 +148,8 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; - private final IndexedDISI disi; + private final IndexInput dataIn; + private final Lucene94HnswVectorsReader.FieldEntry fieldEntry; public SparseOffHeapVectorValues( Lucene94HnswVectorsReader.FieldEntry fieldEntry, @@ -162,19 +163,23 @@ public SparseOffHeapVectorValues( final RandomAccessInput addressesData = dataIn.randomAccessSlice(fieldEntry.addressesOffset(), fieldEntry.addressesLength()); this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta(), addressesData); - this.disi = - new IndexedDISI( - dataIn, - fieldEntry.docsWithFieldOffset(), - fieldEntry.docsWithFieldLength(), - fieldEntry.jumpTableEntryCount(), - fieldEntry.denseRankPower(), - fieldEntry.size()); + this.dataIn = dataIn; + this.fieldEntry = fieldEntry; + } + + private IndexedDISI createDISI() throws IOException { + return new IndexedDISI( + dataIn.clone(), + fieldEntry.docsWithFieldOffset(), + 
fieldEntry.docsWithFieldLength(), + fieldEntry.jumpTableEntryCount(), + fieldEntry.denseRankPower(), + fieldEntry.size()); } @Override - public DocIndexIterator iterator() { - return IndexedDISI.asDocIndexIterator(disi); + public DocIndexIterator iterator() throws IOException { + return IndexedDISI.asDocIndexIterator(createDISI()); } @Override @@ -202,17 +207,17 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { - DocIndexIterator iterator = iterator(); + IndexedDISI disi = createDISI(); Floats values = values(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(values.get(iterator.index()), query); + return vectorSimilarityFunction.compare(values.get(disi.index()), query); } @Override public DocIdSetIterator iterator() { - return iterator; + return disi; } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index db23afcff17..4b3c8c2f53a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -271,13 +271,13 @@ private static class BufferedFloatVectorValues extends FloatVectorValues { // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; - private final DocIndexIterator iterator; + final DocIdSet docsWithField; BufferedFloatVectorValues(List vectors, int dimension, DocIdSet docsWithField) throws IOException { this.vectors = vectors; this.dimension = dimension; - this.iterator = fromDISI(docsWithField.iterator()); + this.docsWithField = docsWithField; } @Override @@ -306,8 +306,8 @@ public float[] get(int ord) throws IOException { } @Override - public DocIndexIterator iterator() { - return iterator; + public DocIndexIterator 
iterator() throws IOException { + return fromDISI(docsWithField.iterator()); } } @@ -315,13 +315,13 @@ private static class BufferedByteVectorValues extends ByteVectorValues { // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; - private final DocIndexIterator iterator; + final DocIdSet docsWithField; BufferedByteVectorValues(List vectors, int dimension, DocIdSet docsWithField) throws IOException { this.vectors = vectors; this.dimension = dimension; - iterator = fromDISI(docsWithField.iterator()); + this.docsWithField = docsWithField; } @Override @@ -345,8 +345,8 @@ public byte[] get(int targetOrd) { } @Override - public DocIndexIterator iterator() { - return iterator; + public DocIndexIterator iterator() throws IOException { + return fromDISI(docsWithField.iterator()); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 011e550ffb8..b95ff4f2cdc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -23,7 +23,6 @@ import java.util.Arrays; import java.util.List; import java.util.Objects; -import java.util.function.BiFunction; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocsWithFieldSet; @@ -38,6 +37,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.IOBiFunction; import org.apache.lucene.util.IOFunction; /** Writes vectors to an index. 
*/ @@ -120,7 +120,7 @@ private static class FloatVectorValuesSub extends DocIDMerger.Sub { final FloatVectorValues values; final KnnVectorValues.DocIndexIterator iterator; - FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) { + FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) throws IOException { super(docMap); this.values = values; this.iterator = values.iterator(); @@ -142,7 +142,7 @@ private static class ByteVectorValuesSub extends DocIDMerger.Sub { final ByteVectorValues values; final KnnVectorValues.DocIndexIterator iterator; - ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues values) { + ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues values) throws IOException { super(docMap); this.values = values; iterator = values.iterator(); @@ -247,7 +247,7 @@ private static List mergeVectorValues( FieldInfo mergingField, FieldInfos[] sourceFieldInfos, IOFunction valuesSupplier, - BiFunction newSub) + IOBiFunction newSub) throws IOException { List subs = new ArrayList<>(); for (int i = 0; i < knnVectorsReaders.length; i++) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java index 5ccbde1b3bf..b13c0db094b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java @@ -188,8 +188,6 @@ private static class SparseOffHeapVectorValues extends OffHeapByteVectorValues { private final DirectMonotonicReader ordToDoc; private final IndexInput dataIn; private final OrdToDocDISIReaderConfiguration configuration; - private final IndexedDISI disi; - private DocIndexIterator iterator; public SparseOffHeapVectorValues( OrdToDocDISIReaderConfiguration configuration, @@ -213,17 +211,16 @@ public SparseOffHeapVectorValues( this.ordToDoc = 
DirectMonotonicReader.getInstance(configuration.meta, addressesData); this.dataIn = dataIn; this.configuration = configuration; - this.disi = createDISI(); } IndexedDISI createDISI() throws IOException { return new IndexedDISI( - dataIn.clone(), - configuration.docsWithFieldOffset, - configuration.docsWithFieldLength, - configuration.jumpTableEntryCount, - configuration.denseRankPower, - configuration.size); + dataIn.clone(), + configuration.docsWithFieldOffset, + configuration.docsWithFieldLength, + configuration.jumpTableEntryCount, + configuration.denseRankPower, + configuration.size); } @Override @@ -232,12 +229,8 @@ public int ordToDoc(int ord) { } @Override - public DocIndexIterator iterator() { - // we can only create a single iterator since creating a new IndexedDISI - // could throw an IOException - assert iterator == null; - iterator = IndexedDISI.asDocIndexIterator(disi); - return iterator; + public DocIndexIterator iterator() throws IOException { + return IndexedDISI.asDocIndexIterator(createDISI()); } @Override @@ -260,10 +253,10 @@ public int length() { @Override public VectorScorer scorer(byte[] query) throws IOException { + IndexedDISI disi = createDISI(); RandomVectorScorer scorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, this, query); return new VectorScorer() { - IndexedDISI disi = createDISI(); @Override public float score() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 4ff9fdecf3b..94831fd8ae3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -178,7 +178,8 @@ public DocIdSetIterator iterator() { private static class SparseOffHeapVectorValues extends OffHeapFloatVectorValues { private final DirectMonotonicReader ordToDoc; - 
private final IndexedDISI disi; + private final IndexInput dataIn; + private final OrdToDocDISIReaderConfiguration configuration; public SparseOffHeapVectorValues( OrdToDocDISIReaderConfiguration configuration, @@ -194,14 +195,18 @@ public SparseOffHeapVectorValues( final RandomAccessInput addressesData = dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength); this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData); - this.disi = - new IndexedDISI( - dataIn, - configuration.docsWithFieldOffset, - configuration.docsWithFieldLength, - configuration.jumpTableEntryCount, - configuration.denseRankPower, - configuration.size); + this.dataIn = dataIn; + this.configuration = configuration; + } + + private IndexedDISI createDISI() throws IOException { + return new IndexedDISI( + dataIn.clone(), + configuration.docsWithFieldOffset, + configuration.docsWithFieldLength, + configuration.jumpTableEntryCount, + configuration.denseRankPower, + configuration.size); } @Override @@ -228,24 +233,24 @@ public int length() { } @Override - public DocIndexIterator iterator() { - return IndexedDISI.asDocIndexIterator(disi); + public DocIndexIterator iterator() throws IOException { + return IndexedDISI.asDocIndexIterator(createDISI()); } @Override public VectorScorer scorer(float[] query) throws IOException { - DocIndexIterator iterator = iterator(); + IndexedDISI disi = createDISI(); RandomVectorScorer randomVectorScorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, this, query); return new VectorScorer() { @Override public float score() throws IOException { - return randomVectorScorer.score(iterator.index()); + return randomVectorScorer.score(disi.index()); } @Override public DocIdSetIterator iterator() { - return iterator; + return disi; } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java index 7d8f49784af..955dfb79bad 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java @@ -442,7 +442,7 @@ public VectorScorer scorer(float[] query) throws IOException { } @Override - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { return rawVectorValues.iterator(); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index fcb28fefffb..a90f061caa9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -1123,7 +1123,7 @@ public int ordToDoc(int ord) { } @Override - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { return values.iterator(); } } @@ -1209,7 +1209,7 @@ public int ordToDoc(int ord) { } @Override - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { return in.iterator(); } } @@ -1252,7 +1252,7 @@ public float[] get(int ord) throws IOException { } @Override - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { return values.iterator(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index 868601ab83b..363f96eb137 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -462,7 +462,7 @@ 
public int size() { } @Override - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { return createExitableIterator(vectorValues.iterator(), queryTimeout); } @@ -507,7 +507,7 @@ public int ordToDoc(int ord) { } @Override - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { return createExitableIterator(vectorValues.iterator(), queryTimeout); } diff --git a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java index c1e5c1f1ef9..b3735fd557c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java @@ -78,7 +78,7 @@ public int length() { } /** Create an iterator for this instance. */ - public DocIndexIterator iterator() { + public DocIndexIterator iterator() throws IOException { throw new UnsupportedOperationException(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index af42c7f8541..cbad0e23f2d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -312,7 +312,7 @@ private static class MergedDocIterator int ord = -1; int doc = -1; - MergedDocIterator(List> subs) { + MergedDocIterator(List> subs) throws IOException { this.it = subs.iterator(); current = it.next(); currentIterator = currentIterator(); @@ -348,7 +348,7 @@ public int nextDoc() throws IOException { } } - private KnnVectorValues.DocIndexIterator currentIterator() { + private KnnVectorValues.DocIndexIterator currentIterator() throws IOException { if (current.sub != null) { return current.sub.iterator(); } else { @@ -850,15 +850,14 @@ public FloatVectorValues getFloatVectorValues(String 
field) throws IOException { class MergedFloatVectorValues extends FloatVectorValues { final int dimension; final int size; - final DocValuesSub[] subs; - final MergedDocIterator iter; + final List> subs; final int[] starts; - MergedFloatVectorValues(int dimension, int size, List> subs) { + MergedFloatVectorValues(int dimension, int size, List> subs) + throws IOException { this.dimension = dimension; this.size = size; - this.subs = subs.toArray(new DocValuesSub[0]); - iter = new MergedDocIterator<>(subs); + this.subs = subs; // [0, start(1), ..., size] - we want the extra element // to avoid checking for out-of-array bounds starts = new int[subs.size() + 1]; @@ -869,8 +868,8 @@ class MergedFloatVectorValues extends FloatVectorValues { } @Override - public MergedDocIterator iterator() { - return iter; + public MergedDocIterator iterator() throws IOException { + return new MergedDocIterator(subs); } @Override @@ -897,10 +896,10 @@ public float[] get(int ord) throws IOException { int newSubIndex = findSub(ord, lastSubIndex, starts); if (newSubIndex != lastSubIndex) { lastSubIndex = newSubIndex; - assert subs[lastSubIndex].sub != null; - subValues = ((FloatVectorValues) subs[lastSubIndex].sub).values(); + assert subs.get(lastSubIndex).sub != null; + subValues = subs.get(lastSubIndex).sub.values(); } - return subValues.get(ord - subs[lastSubIndex].ordStart); + return subValues.get(ord - subs.get(lastSubIndex).ordStart); } }; } @@ -929,15 +928,14 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { class MergedByteVectorValues extends ByteVectorValues { final int dimension; final int size; - final DocValuesSub[] subs; - final MergedDocIterator iter; + final List> subs; final int[] starts; - MergedByteVectorValues(int dimension, int size, List> subs) { + MergedByteVectorValues(int dimension, int size, List> subs) + throws IOException { this.dimension = dimension; this.size = size; - this.subs = subs.toArray(new DocValuesSub[0]); - iter = new 
MergedDocIterator<>(subs); + this.subs = subs; // [0, start(1), ..., size] - we want the extra element // to avoid checking for out-of-array bounds starts = new int[subs.size() + 1]; @@ -948,8 +946,8 @@ class MergedByteVectorValues extends ByteVectorValues { } @Override - public MergedDocIterator iterator() { - return iter; + public MergedDocIterator iterator() throws IOException { + return new MergedDocIterator(subs); } @Override @@ -977,10 +975,10 @@ public byte[] get(int ord) throws IOException { int newSubIndex = findSub(ord, lastSubIndex, starts); if (newSubIndex != lastSubIndex) { lastSubIndex = newSubIndex; - assert subs[lastSubIndex].sub != null; - subValues = ((ByteVectorValues) subs[lastSubIndex].sub).values(); + assert subs.get(lastSubIndex).sub != null; + subValues = subs.get(lastSubIndex).sub.values(); } - return subValues.get(ord - subs[lastSubIndex].ordStart); + return subValues.get(ord - subs.get(lastSubIndex).ordStart); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/IOBiFunction.java b/lucene/core/src/java/org/apache/lucene/util/IOBiFunction.java new file mode 100644 index 00000000000..b8ff708866b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/IOBiFunction.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import java.io.IOException; + +/** + * A BiFunction that may throw an IOException + * + * @see java.util.function.BiFunction + * @param the type of the input to the function + * @param the type of the result of the function + */ +@FunctionalInterface +public interface IOBiFunction { + /** + * Applies this function to the given arguments. + * + * @param s the first function argument + * @param t the second function argument + * @return the function result + * @throws IOException if producing the result throws an {@link IOException} + */ + R apply(S s, T t) throws IOException; +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index ef904867349..ab66300298c 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -117,7 +117,9 @@ public RandomVectorScorer scorer(int ord) { @Override public float score(int node) throws IOException { checkOrdinal(node); - float raw = PanamaVectorUtilSupport.cosine(getFirstSegment(slice, ord), getSecondSegment(slice, node)); + float raw = + PanamaVectorUtilSupport.cosine( + getFirstSegment(slice, ord), getSecondSegment(slice, node)); return (1 + raw) / 2; } }; @@ -135,12 +137,14 @@ public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); return new RandomVectorScorer.AbstractRandomVectorScorer(values) { MemorySegmentAccessInput slice = input.clone(); + @Override public float score(int node) throws IOException { checkOrdinal(node); // divide by 2 * 2^14 (maximum absolute value of product of 2 signed
bytes) * len float raw = - PanamaVectorUtilSupport.dotProduct(getFirstSegment(slice, ord), getSecondSegment(slice, node)); + PanamaVectorUtilSupport.dotProduct( + getFirstSegment(slice, ord), getSecondSegment(slice, node)); return 0.5f + raw / (float) (values.dimension() * (1 << 15)); } }; @@ -162,7 +166,8 @@ public RandomVectorScorer scorer(int ord) { public float score(int node) throws IOException { checkOrdinal(node); float raw = - PanamaVectorUtilSupport.squareDistance(getFirstSegment(slice, ord), getSecondSegment(slice, node)); + PanamaVectorUtilSupport.squareDistance( + getFirstSegment(slice, ord), getSecondSegment(slice, node)); return 1 / (1f + raw); } }; @@ -180,11 +185,13 @@ public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); return new RandomVectorScorer.AbstractRandomVectorScorer(values) { MemorySegmentAccessInput slice = input.clone(); + @Override public float score(int node) throws IOException { checkOrdinal(node); float raw = - PanamaVectorUtilSupport.dotProduct(getFirstSegment(slice, ord), getSecondSegment(slice, node)); + PanamaVectorUtilSupport.dotProduct( + getFirstSegment(slice, ord), getSecondSegment(slice, node)); if (raw < 0) { return 1 / (1 + -1 * raw); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index e33b88ff583..da2a1b1ba60 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -1201,13 +1201,18 @@ public void testSortedIndexBytes() throws Exception { ByteVectorValues.Bytes vectors = vectorValues.values(); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); - assertEquals("1", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); + DocIdSetIterator iter = 
vectorValues.iterator(); + assertEquals("1", storedFields.document(iter.nextDoc()).get("id")); assertEquals(-1, vectors.get(0)[0], 0); - assertEquals("2", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); + assertEquals("2", storedFields.document(iter.nextDoc()).get("id")); assertEquals(1, vectors.get(1)[0], 0); - assertEquals("4", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); + assertEquals("4", storedFields.document(iter.nextDoc()).get("id")); assertEquals(0, vectors.get(2)[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.iterator().nextDoc()); + assertEquals(NO_MORE_DOCS, iter.nextDoc()); + // Each call to iterator() produces a new iterator + DocIdSetIterator iter2 = vectorValues.iterator(); + assertNotSame(iter, iter2); + assertEquals("1", storedFields.document(iter2.nextDoc()).get("id")); } } } From 273e8ed64313b0dc78d025709d9886e20c78ea25 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 09:14:53 -0600 Subject: [PATCH 04/25] fix cloning/sharing of vector scorer resources --- .../codecs/hnsw/DefaultFlatVectorScorer.java | 34 ++++++------- ...MemorySegmentByteVectorScorerSupplier.java | 48 ++++++++----------- .../lucene/store/MemorySegmentIndexInput.java | 2 +- .../vectorization/TestVectorScorer.java | 8 +++- 4 files changed, 42 insertions(+), 50 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index 729dd6616c4..7e46a36928d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -88,22 +88,20 @@ public String toString() { /** RandomVectorScorerSupplier for bytes vector */ private static final class ByteScoringSupplier implements RandomVectorScorerSupplier { - private final ByteVectorValues vectors; - private final ByteVectorValues.Bytes 
vectors1; - private final ByteVectorValues.Bytes vectors2; + private final ByteVectorValues vectorValues; private final VectorSimilarityFunction similarityFunction; private ByteScoringSupplier( ByteVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { - this.vectors = vectors; - vectors1 = vectors.values(); - vectors2 = vectors.values(); + vectorValues = vectors; this.similarityFunction = similarityFunction; } @Override - public RandomVectorScorer scorer(int ord) { - return new RandomVectorScorer.AbstractRandomVectorScorer(vectors) { + public RandomVectorScorer scorer(int ord) throws IOException { + ByteVectorValues.Bytes vectors1 = vectorValues.values(); + ByteVectorValues.Bytes vectors2 = vectorValues.values(); + return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { return similarityFunction.compare(vectors1.get(ord), vectors2.get(node)); @@ -119,22 +117,20 @@ public String toString() { /** RandomVectorScorerSupplier for Float vector */ private static final class FloatScoringSupplier implements RandomVectorScorerSupplier { - private final FloatVectorValues vectors; - private final FloatVectorValues.Floats vectors1; - private final FloatVectorValues.Floats vectors2; + private final FloatVectorValues vectorValues; private final VectorSimilarityFunction similarityFunction; private FloatScoringSupplier( FloatVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { - this.vectors = vectors; - vectors1 = vectors.values(); - vectors2 = vectors.values(); + vectorValues = vectors; this.similarityFunction = similarityFunction; } @Override - public RandomVectorScorer scorer(int ord) { - return new RandomVectorScorer.AbstractRandomVectorScorer(vectors) { + public RandomVectorScorer scorer(int ord) throws IOException { + FloatVectorValues.Floats vectors1 = vectorValues.values(); + FloatVectorValues.Floats vectors2 = 
vectorValues.values(); + return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { return similarityFunction.compare(vectors1.get(ord), vectors2.get(node)); @@ -150,7 +146,7 @@ public String toString() { /** A {@link RandomVectorScorer} for float vectors. */ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final FloatVectorValues.Floats floats; + private final FloatVectorValues.Floats vectors; private final float[] query; private final VectorSimilarityFunction similarityFunction; @@ -158,14 +154,14 @@ public FloatVectorScorer( FloatVectorValues values, float[] query, VectorSimilarityFunction similarityFunction) throws IOException { super(values); - this.floats = values.values(); + this.vectors = values.values(); this.query = query; this.similarityFunction = similarityFunction; } @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, floats.get(node)); + return similarityFunction.compare(query, vectors.get(node)); } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index ab66300298c..0a4ee85a310 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -35,7 +35,6 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier final int maxOrd; final MemorySegmentAccessInput input; final KnnVectorValues values; // to support ordToDoc/getAcceptOrds - byte[] scratch1, scratch2; /** * Return an optional whose value, if present, is the scorer supplier. 
Otherwise, an empty @@ -77,28 +76,13 @@ final void checkOrdinal(int ord) { } } - final MemorySegment getFirstSegment(MemorySegmentAccessInput input, int ord) throws IOException { + final MemorySegment getSegment(MemorySegmentAccessInput input, byte[] scratch, int ord) + throws IOException { long byteOffset = (long) ord * vectorByteSize; MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); if (seg == null) { - if (scratch1 == null) { - scratch1 = new byte[vectorByteSize]; - } - input.readBytes(byteOffset, scratch1, 0, vectorByteSize); - seg = MemorySegment.ofArray(scratch1); - } - return seg; - } - - final MemorySegment getSecondSegment(MemorySegmentAccessInput input, int ord) throws IOException { - long byteOffset = (long) ord * vectorByteSize; - MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); - if (seg == null) { - if (scratch2 == null) { - scratch2 = new byte[vectorByteSize]; - } - input.readBytes(byteOffset, scratch2, 0, vectorByteSize); - seg = MemorySegment.ofArray(scratch2); + input.readBytes(byteOffset, scratch, 0, vectorByteSize); + seg = MemorySegment.ofArray(scratch); } return seg; } @@ -113,13 +97,15 @@ static final class CosineSupplier extends Lucene99MemorySegmentByteVectorScorerS public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); MemorySegmentAccessInput slice = input.clone(); + byte[] scratch1 = new byte[vectorByteSize]; + byte[] scratch2 = new byte[vectorByteSize]; return new RandomVectorScorer.AbstractRandomVectorScorer(values) { @Override public float score(int node) throws IOException { checkOrdinal(node); float raw = PanamaVectorUtilSupport.cosine( - getFirstSegment(slice, ord), getSecondSegment(slice, node)); + getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); return (1 + raw) / 2; } }; @@ -135,16 +121,17 @@ static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorSco @Override public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); + 
MemorySegmentAccessInput slice = input.clone(); + byte[] scratch1 = new byte[vectorByteSize]; + byte[] scratch2 = new byte[vectorByteSize]; return new RandomVectorScorer.AbstractRandomVectorScorer(values) { - MemorySegmentAccessInput slice = input.clone(); - @Override public float score(int node) throws IOException { checkOrdinal(node); // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len float raw = PanamaVectorUtilSupport.dotProduct( - getFirstSegment(slice, ord), getSecondSegment(slice, node)); + getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); return 0.5f + raw / (float) (values.dimension() * (1 << 15)); } }; @@ -161,13 +148,16 @@ static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScor public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); MemorySegmentAccessInput slice = input.clone(); + byte[] scratch1 = new byte[vectorByteSize]; + byte[] scratch2 = new byte[vectorByteSize]; + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { @Override public float score(int node) throws IOException { checkOrdinal(node); float raw = PanamaVectorUtilSupport.squareDistance( - getFirstSegment(slice, ord), getSecondSegment(slice, node)); + getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); return 1 / (1f + raw); } }; @@ -183,15 +173,17 @@ static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVect @Override public RandomVectorScorer scorer(int ord) { checkOrdinal(ord); - return new RandomVectorScorer.AbstractRandomVectorScorer(values) { - MemorySegmentAccessInput slice = input.clone(); + MemorySegmentAccessInput slice = input.clone(); + byte[] scratch1 = new byte[vectorByteSize]; + byte[] scratch2 = new byte[vectorByteSize]; + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { @Override public float score(int node) throws IOException { checkOrdinal(node); float raw = PanamaVectorUtilSupport.dotProduct( - getFirstSegment(slice, 
ord), getSecondSegment(slice, node)); + getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); if (raw < 0) { return 1 / (1 + -1 * raw); } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 8bb70ba009d..7ec7c6330f1 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -191,7 +191,7 @@ private void readBytesBoundary(byte[] b, int offset, int len) throws IOException offset += curAvail; curSegmentIndex++; if (curSegmentIndex >= segments.length) { - throw new EOFException("read past EOF: " + this); + throw new EOFException("read past EOF: " + this + " curSegIndex=" + curSegmentIndex); } curSegment = segments[curSegmentIndex]; curPosition = 0L; diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index 086810001d7..225caba15bf 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -253,7 +253,6 @@ public void testCopiesAcrossThreads() throws Exception { var scoreSupplier = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); var expectedScore1 = scoreSupplier.scorer(0).score(1); var expectedScore2 = scoreSupplier.scorer(2).score(3); - var scorer = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); var tasks = List.>>of( @@ -280,7 +279,12 @@ record AssertingScoreCallable(RandomVectorScorer scorer, int ord, float expected public Optional call() throws Exception { try { for (int i = 0; i < 100; i++) { - assertEquals(scorer.score(ord), expectedScore, DELTA); + float score = scorer.score(ord); + assertEquals( + "ord=" + ord + " i=" + i + 
" expected=" + expectedScore + " actual=" + score, + expectedScore, + score, + DELTA); } } catch (Throwable t) { return Optional.of(t); From ce70f4c53d166ee0c4240a69e489d042611d33ca Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 10:00:20 -0600 Subject: [PATCH 05/25] renaming --- .../synonym/word2vec/Word2VecModel.java | 2 +- .../lucene90/Lucene90HnswGraphBuilder.java | 6 +- .../lucene90/Lucene90HnswVectorsReader.java | 4 +- .../lucene90/Lucene90OnHeapHnswGraph.java | 6 +- .../lucene91/Lucene91HnswVectorsReader.java | 4 +- .../lucene92/OffHeapFloatVectorValues.java | 14 ++-- .../lucene94/OffHeapByteVectorValues.java | 8 +-- .../lucene94/OffHeapFloatVectorValues.java | 14 ++-- .../lucene90/Lucene90HnswVectorsWriter.java | 2 +- .../lucene91/Lucene91HnswGraphBuilder.java | 6 +- .../lucene91/Lucene91HnswVectorsWriter.java | 2 +- .../lucene92/Lucene92HnswVectorsWriter.java | 2 +- .../lucene94/Lucene94HnswVectorsWriter.java | 4 +- .../lucene95/Lucene95HnswVectorsWriter.java | 4 +- .../TestBasicBackwardsCompatibility.java | 2 +- .../bitvectors/FlatBitVectorsScorer.java | 4 +- .../SimpleTextKnnVectorsReader.java | 36 +++++------ .../SimpleTextKnnVectorsWriter.java | 6 +- .../codecs/BufferingKnnVectorsWriter.java | 12 ++-- .../lucene/codecs/KnnVectorsWriter.java | 44 ++++++------- .../codecs/hnsw/DefaultFlatVectorScorer.java | 20 +++--- .../hnsw/ScalarQuantizedVectorScorer.java | 12 ++-- .../lucene95/OffHeapByteVectorValues.java | 4 +- .../lucene95/OffHeapFloatVectorValues.java | 4 +- .../lucene99/Lucene99FlatVectorsWriter.java | 6 +- .../Lucene99ScalarQuantizedVectorScorer.java | 58 ++++++++--------- .../Lucene99ScalarQuantizedVectorsReader.java | 6 +- .../Lucene99ScalarQuantizedVectorsWriter.java | 64 +++++++++---------- .../OffHeapQuantizedByteVectorValues.java | 4 +- .../apache/lucene/index/ByteVectorValues.java | 4 +- .../org/apache/lucene/index/CheckIndex.java | 32 +++++----- .../lucene/index/ExitableDirectoryReader.java | 10 +-- 
.../lucene/index/FloatVectorValues.java | 4 +- .../SlowCompositeCodecReaderWrapper.java | 8 +-- .../lucene/index/SortingCodecReader.java | 10 +-- .../QuantizedByteVectorValues.java | 2 +- .../util/quantization/ScalarQuantizer.java | 14 ++-- ...estLucene99HnswQuantizedVectorsFormat.java | 2 +- ...stLucene99ScalarQuantizedVectorScorer.java | 2 +- ...tLucene99ScalarQuantizedVectorsFormat.java | 2 +- .../org/apache/lucene/document/TestField.java | 8 +-- .../index/TestExitableDirectoryReader.java | 2 +- .../org/apache/lucene/index/TestKnnGraph.java | 4 +- .../lucene/index/TestSortingCodecReader.java | 2 +- .../lucene/util/hnsw/HnswGraphTestCase.java | 12 ++-- .../util/hnsw/MockByteVectorValues.java | 2 +- .../lucene/util/hnsw/MockVectorValues.java | 2 +- .../util/hnsw/TestHnswByteVectorGraph.java | 2 +- .../util/hnsw/TestHnswFloatVectorGraph.java | 4 +- .../TestScalarQuantizedVectorSimilarity.java | 2 +- .../quantization/TestScalarQuantizer.java | 2 +- .../lucene/index/memory/MemoryIndex.java | 8 +-- .../lucene/index/memory/TestMemoryIndex.java | 4 +- .../valuesource/ByteKnnVectorFieldSource.java | 2 +- .../FloatKnnVectorFieldSource.java | 2 +- .../sandbox/codecs/quantization/KMeans.java | 44 ++++++------- .../codecs/quantization/SampleReader.java | 4 +- .../index/BaseKnnVectorsFormatTestCase.java | 40 ++++++------ 58 files changed, 298 insertions(+), 298 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java index ced8a4230fe..6bf3e9712b7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java @@ -50,7 +50,7 @@ public void addTermAndVector(TermAndVector modelEntry) { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { 
@Override public float[] get(int targetOrd) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index eb9ac7abedd..7953b1c066d 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -81,8 +81,8 @@ public Lucene90HnswGraphBuilder( long seed) throws IOException { this.vectorValues = vectors; - this.vectors = vectors.values(); - buildVectors = vectors.values(); + this.vectors = vectorValues.vectors(); + buildVectors = vectorValues.vectors(); this.similarityFunction = Objects.requireNonNull(similarityFunction); if (maxConn <= 0) { throw new IllegalArgumentException("maxConn must be positive"); @@ -112,7 +112,7 @@ public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOExcepti } long start = System.nanoTime(), t = start; // start at node 1! 
node 0 is added implicitly, in the constructor - FloatVectorValues.Floats values = vectors.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); for (int node = 1; node < vectors.size(); node++) { addGraphNode(values.get(node)); if (node % 10000 == 0) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index 2e860777ba7..859d4148026 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -388,7 +388,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int targetOrd) throws IOException { @@ -418,7 +418,7 @@ public VectorScorer scorer(float[] target) { if (size() == 0) { return null; } - FloatVectorValues.Floats values = values(); + FloatVectorValues.Floats values = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java index 6faf2f06cd8..95f2b8742b4 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java @@ -62,7 +62,7 @@ public final class Lucene90OnHeapHnswGraph extends HnswGraph { * @param topK the number of nodes to be returned * @param numSeed the size of the queue maintained while searching, and controls the number of * random entry points to sample - * @param vectors vector values + * 
@param vectorValues vector values * @param graphValues the graph values. May represent the entire graph, or a level in a * hierarchical graph. * @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or @@ -74,7 +74,7 @@ public static NeighborQueue search( float[] query, int topK, int numSeed, - FloatVectorValues vectors, + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction, HnswGraph graphValues, Bits acceptOrds, @@ -83,7 +83,7 @@ public static NeighborQueue search( throws IOException { int size = graphValues.size(); - FloatVectorValues.Floats values = vectors.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); // MIN heap, holding the top results NeighborQueue results = new NeighborQueue(numSeed, false); // MAX heap, from which to pull the candidate nodes diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java index 25f362ad09d..0e1b349995c 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java @@ -431,7 +431,7 @@ public int size() { } @Override - public Floats values() throws IOException { + public Floats vectors() throws IOException { IndexInput input = dataIn.clone(); float[] value = new float[dimension]; return new Floats() { @@ -459,7 +459,7 @@ public VectorScorer scorer(float[] target) throws IOException { if (size == 0) { return null; } - Floats values = values(); + Floats values = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 6c7944c310d..66744d6e234 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -60,9 +60,9 @@ public int size() { } @Override - public Floats values() throws IOException { + public Floats vectors() throws IOException { return new Floats() { - final IndexInput dictionarySlice = slice.clone(); + final IndexInput vectorSlice = slice.clone(); int lastOrd = -1; float[] value = new float[dimension]; @@ -71,8 +71,8 @@ public float[] get(int targetOrd) throws IOException { if (lastOrd == targetOrd) { return value; } - dictionarySlice.seek((long) targetOrd * byteSize); - dictionarySlice.readFloats(value, 0, value.length); + vectorSlice.seek((long) targetOrd * byteSize); + vectorSlice.readFloats(value, 0, value.length); lastOrd = targetOrd; return value; } @@ -118,7 +118,7 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { - FloatVectorValues.Floats values = values(); + FloatVectorValues.Floats values = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override @@ -194,7 +194,7 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { - FloatVectorValues.Floats values = values(); + FloatVectorValues.Floats values = vectors(); IndexedDISI disi = createDISI(); return new VectorScorer() { @Override @@ -227,7 +227,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int targetOrd) throws IOException { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java index acc39e48a22..38e9199279e 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java @@ -63,7 +63,7 @@ public int size() { } @Override - public Bytes values() throws IOException { + public Bytes vectors() throws IOException { return new Bytes() { IndexInput input = slice.clone(); ByteBuffer byteBuffer = ByteBuffer.allocate(byteSize); @@ -134,7 +134,7 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(byte[] query) throws IOException { DocIndexIterator iterator = iterator(); - Bytes vectors = values(); + Bytes vectors = vectors(); return new VectorScorer() { @Override public float score() throws IOException { @@ -211,7 +211,7 @@ public int length() { @Override public VectorScorer scorer(byte[] query) throws IOException { return new VectorScorer() { - ByteVectorValues.Bytes vectors = values(); + ByteVectorValues.Bytes vectors = vectors(); IndexedDISI disi = createDISI(); @Override @@ -244,7 +244,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { return Bytes.EMPTY; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index 5b67552e78a..b412ebd8f5b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -61,9 +61,9 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { - final IndexInput dictionarySlice = slice.clone(); + 
final IndexInput vectorSlice = slice.clone(); int lastOrd = -1; float[] value = new float[dimension]; @@ -72,8 +72,8 @@ public float[] get(int targetOrd) throws IOException { if (lastOrd == targetOrd) { return value; } - dictionarySlice.seek((long) targetOrd * byteSize); - dictionarySlice.readFloats(value, 0, value.length); + vectorSlice.seek((long) targetOrd * byteSize); + vectorSlice.readFloats(value, 0, value.length); lastOrd = targetOrd; return value; } @@ -129,7 +129,7 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { - Floats floats = values(); + Floats floats = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @@ -208,7 +208,7 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { IndexedDISI disi = createDISI(); - Floats values = values(); + Floats values = vectors(); return new VectorScorer() { @Override public float score() throws IOException { @@ -240,7 +240,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int targetOrd) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java index b4d00e6567e..cc70df0acb9 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java @@ -189,7 +189,7 @@ private static int[] writeVectorData(IndexOutput output, FloatVectorValues vecto ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = vectors.iterator(); - FloatVectorValues.Floats values = 
vectors.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector binaryVector.asFloatBuffer().put(values.get(iter.index())); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java index c3ef3a8bb4a..ad374fbf038 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java @@ -91,8 +91,8 @@ public Lucene91HnswGraphBuilder( long seed) throws IOException { this.vectors = vectors; - vectorValues = vectors.values(); - buildVectors = vectors.values(); + vectorValues = vectorValues.vectors(); + buildVectors = vectorValues.vectors(); this.similarityFunction = Objects.requireNonNull(similarityFunction); if (maxConn <= 0) { throw new IllegalArgumentException("maxConn must be positive"); @@ -122,7 +122,7 @@ public Lucene91HnswGraphBuilder( * accessor for the vectors */ public Lucene91OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { - FloatVectorValues.Floats values = vectors.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); if (infoStream.isEnabled(HNSW_COMPONENT)) { infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors"); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java index 0090e59922d..5e1f34ef243 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java +++ 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java @@ -182,7 +182,7 @@ private static DocsWithFieldSet writeVectorData(IndexOutput output, FloatVectorV ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = vectors.iterator(); - FloatVectorValues.Floats values = vectors.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); for (int docV = iter.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector binaryVector.asFloatBuffer().put(values.get(iter.index())); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java index 93bacddf452..03c1ff79336 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java @@ -190,7 +190,7 @@ private static DocsWithFieldSet writeVectorData(IndexOutput output, FloatVectorV ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iterator = vectors.iterator(); - FloatVectorValues.Floats values = vectors.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); for (int docV = iterator.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iterator.nextDoc()) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java index ca7dc1cf192..38b0779ab5f 100644 --- 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -587,7 +587,7 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); - ByteVectorValues.Bytes vectors = byteVectorValues.values(); + ByteVectorValues.Bytes vectors = byteVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector byte[] binaryValue = vectors.get(iter.index()); @@ -605,7 +605,7 @@ private static DocsWithFieldSet writeVectorData( IndexOutput output, FloatVectorValues floatVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); - FloatVectorValues.Floats values = floatVectorValues.values(); + FloatVectorValues.Floats values = floatVectorValues.vectors(); ByteBuffer binaryVector = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index 481a576253a..471ba54cca6 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -635,7 +635,7 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new 
DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); - ByteVectorValues.Bytes vectors = byteVectorValues.values(); + ByteVectorValues.Bytes vectors = byteVectorValues.vectors(); for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { // write vector byte[] binaryValue = vectors.get(iter.index()); @@ -656,7 +656,7 @@ private static DocsWithFieldSet writeVectorData( ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); - FloatVectorValues.Floats values = floatVectorValues.values(); + FloatVectorValues.Floats values = floatVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector buffer.asFloatBuffer().put(values.get(iter.index())); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java index c8c9b7e46ef..2ef87707897 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java @@ -478,7 +478,7 @@ public static void searchIndex( if (values != null) { assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension()); KnnVectorValues.DocIndexIterator it = values.iterator(); - FloatVectorValues.Floats vectors = values.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) { float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt}; assertArrayEquals( diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java index 1a6f4878394..11ae0b5cbb5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java @@ -68,7 +68,7 @@ static class BitRandomVectorScorer implements RandomVectorScorer { this.query = query; this.bitDimensions = vectorValues.dimension() * Byte.SIZE; this.vectorValues = vectorValues; - vectors = vectorValues.values(); + vectors = vectorValues.vectors(); } @Override @@ -99,7 +99,7 @@ static class BitRandomVectorScorerSupplier implements RandomVectorScorerSupplier public BitRandomVectorScorerSupplier(ByteVectorValues vectorValues) throws IOException { this.vectorValues = vectorValues; - this.vectors = vectorValues.values(); + this.vectors = vectorValues.vectors(); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java index 1fe9dda39a6..4c2a5feabfb 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java @@ -182,19 +182,19 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FloatVectorValues values = getFloatVectorValues(field); - if (target.length != values.dimension()) { + FloatVectorValues vectorValues = getFloatVectorValues(field); + if (target.length != vectorValues.dimension()) { throw new IllegalArgumentException( "vector query dimension: " + target.length + " differs from field dimension: " - + values.dimension()); + + vectorValues.dimension()); } FieldInfo info = 
readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); - FloatVectorValues.Floats valuesDict = values.values(); - for (int ord = 0; ord < values.size(); ord++) { - int doc = values.ordToDoc(ord); + FloatVectorValues.Floats vectors = vectorValues.vectors(); + for (int ord = 0; ord < vectorValues.size(); ord++) { + int doc = vectorValues.ordToDoc(ord); if (acceptDocs != null && acceptDocs.get(doc) == false) { continue; } @@ -203,7 +203,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits break; } - float[] vector = valuesDict.get(ord); + float[] vector = vectors.get(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -213,20 +213,20 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits @Override public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - ByteVectorValues values = getByteVectorValues(field); - if (target.length != values.dimension()) { + ByteVectorValues vectorValues = getByteVectorValues(field); + if (target.length != vectorValues.dimension()) { throw new IllegalArgumentException( "vector query dimension: " + target.length + " differs from field dimension: " - + values.dimension()); + + vectorValues.dimension()); } - ByteVectorValues.Bytes vectors = values.values(); + ByteVectorValues.Bytes vectors = vectorValues.vectors(); FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); - for (int ord = 0; ord < values.size(); ord++) { - int doc = values.ordToDoc(ord); + for (int ord = 0; ord < vectorValues.size(); ord++) { + int doc = vectorValues.ordToDoc(ord); if (acceptDocs != null && acceptDocs.get(doc) == false) { continue; } @@ -325,7 +325,7 @@ public int size() { } @Override - public Floats values() { + public 
Floats vectors() { return new Floats() { @Override public float[] get(int ord) { @@ -352,12 +352,12 @@ public VectorScorer scorer(float[] target) { SimpleTextFloatVectorValues simpleTextFloatVectorValues = new SimpleTextFloatVectorValues(this); DocIndexIterator iterator = simpleTextFloatVectorValues.iterator(); - Floats valuesDict = simpleTextFloatVectorValues.values(); + Floats vectors = simpleTextFloatVectorValues.vectors(); return new VectorScorer() { @Override public float score() throws IOException { int ord = iterator.index(); - return entry.similarityFunction().compare(valuesDict.get(ord), target); + return entry.similarityFunction().compare(vectors.get(ord), target); } @Override @@ -417,7 +417,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { BytesRef binaryValue = new BytesRef(entry.dimension); binaryValue.length = binaryValue.bytes.length; @@ -446,7 +446,7 @@ public VectorScorer scorer(byte[] target) { return null; } SimpleTextByteVectorValues simpleTextByteVectorValues = new SimpleTextByteVectorValues(this); - ByteVectorValues.Bytes vectors = simpleTextByteVectorValues.values(); + ByteVectorValues.Bytes vectors = simpleTextByteVectorValues.vectors(); return new VectorScorer() { DocIndexIterator it = simpleTextByteVectorValues.iterator(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java index 42dbe7e85ac..8ba828e6bc1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java @@ -79,9 +79,9 @@ public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); - 
FloatVectorValues.Floats valuesDict = floatVectorValues.values(); + FloatVectorValues.Floats vectors = floatVectorValues.vectors(); for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { - writeFloatVectorValue(valuesDict, floatVectorValues.dimension(), iter.index()); + writeFloatVectorValue(vectors, floatVectorValues.dimension(), iter.index()); docIds.add(docId); } long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; @@ -103,7 +103,7 @@ public void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, i long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); KnnVectorValues.DocIndexIterator it = byteVectorValues.iterator(); - ByteVectorValues.Bytes vectors = byteVectorValues.values(); + ByteVectorValues.Bytes vectors = byteVectorValues.vectors(); for (int docV = it.nextDoc(); docV != NO_MORE_DOCS; docV = it.nextDoc()) { writeByteVectorValue(vectors, it.index(), byteVectorValues.dimension()); docIds.add(docV); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 4b3c8c2f53a..c73f5454136 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -122,8 +122,8 @@ private static class SortingFloatVectorValues extends FloatVectorValues { } @Override - public Floats values() { - Floats delegateFloats = delegate.values(); + public Floats vectors() { + Floats delegateFloats = delegate.vectors(); return new Floats() { @Override public float[] get(int ord) throws IOException { @@ -161,9 +161,9 @@ private static class SortingByteVectorValues extends ByteVectorValues { } @Override - public Bytes values() throws IOException { + public Bytes vectors() throws IOException { return new Bytes() { - Bytes values = delegate.values(); + Bytes values = 
delegate.vectors(); @Override public byte[] get(int ord) throws IOException { @@ -296,7 +296,7 @@ public int ordToDoc(int ord) { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) throws IOException { @@ -335,7 +335,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { @Override public byte[] get(int targetOrd) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index b95ff4f2cdc..d85baed323c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -61,7 +61,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); ByteVectorValues mergedBytes = MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); - ByteVectorValues.Bytes values = mergedBytes.values(); + ByteVectorValues.Bytes values = mergedBytes.vectors(); KnnVectorValues.DocIndexIterator iter = mergedBytes.iterator(); for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { byteWriter.addValue(doc, values.get(iter.index())); @@ -72,7 +72,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); FloatVectorValues mergedFloats = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); - FloatVectorValues.Floats values = mergedFloats.values(); + FloatVectorValues.Floats values = mergedFloats.vectors(); KnnVectorValues.DocIndexIterator iter = mergedFloats.iterator(); for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { floatWriter.addValue(doc, values.get(iter.index())); @@ -117,13 +117,13 @@ public final void merge(MergeState mergeState) throws 
IOException { /** Tracks state of one sub-reader that we are merging */ private static class FloatVectorValuesSub extends DocIDMerger.Sub { - final FloatVectorValues values; + final FloatVectorValues vectorValues; final KnnVectorValues.DocIndexIterator iterator; - FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) throws IOException { + FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues vectorValues) throws IOException { super(docMap); - this.values = values; - this.iterator = values.iterator(); + this.vectorValues = vectorValues; + this.iterator = vectorValues.iterator(); assert iterator.docID() == -1; } @@ -139,13 +139,13 @@ public int index() { private static class ByteVectorValuesSub extends DocIDMerger.Sub { - final ByteVectorValues values; + final ByteVectorValues vectorValues; final KnnVectorValues.DocIndexIterator iterator; - ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues values) throws IOException { + ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues vectorValues) throws IOException { super(docMap); - this.values = values; - iterator = values.iterator(); + this.vectorValues = vectorValues; + iterator = vectorValues.iterator(); assert iterator.docID() == -1; } @@ -310,7 +310,7 @@ private MergedFloat32VectorValues(List subs, MergeState me docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); int totalSize = 0; for (FloatVectorValuesSub sub : subs) { - totalSize += sub.values.size(); + totalSize += sub.vectorValues.size(); } size = totalSize; } @@ -357,7 +357,7 @@ public long cost() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { FloatVectorValues currentValues = null; Floats currentFloats = null; @@ -371,9 +371,9 @@ public float[] get(int ord) throws IOException { + ", lastOrd=" + lastOrd); } - if (currentValues != current.values) { - currentValues = current.values; - currentFloats = current.values.values(); + if (currentValues != 
current.vectorValues) { + currentValues = current.vectorValues; + currentFloats = current.vectorValues.vectors(); } return currentFloats.get(current.index()); } @@ -387,7 +387,7 @@ public int size() { @Override public int dimension() { - return subs.get(0).values.dimension(); + return subs.get(0).vectorValues.dimension(); } @Override @@ -416,13 +416,13 @@ private MergedByteVectorValues(List subs, MergeState mergeS docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); int totalSize = 0; for (ByteVectorValuesSub sub : subs) { - totalSize += sub.values.size(); + totalSize += sub.vectorValues.size(); } size = totalSize; } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { ByteVectorValues currentValues = null; Bytes currentBytes = null; @@ -436,9 +436,9 @@ public byte[] get(int ord) throws IOException { + ", lastOrd=" + lastOrd); } - if (currentValues != current.values) { - currentValues = current.values; - currentBytes = current.values.values(); + if (currentValues != current.vectorValues) { + currentValues = current.vectorValues; + currentBytes = current.vectorValues.vectors(); } return currentBytes.get(current.index()); } @@ -493,7 +493,7 @@ public int size() { @Override public int dimension() { - return subs.get(0).values.dimension(); + return subs.get(0).vectorValues.dimension(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index 7e46a36928d..a153ce086f8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -99,8 +99,8 @@ private ByteScoringSupplier( @Override public RandomVectorScorer scorer(int ord) throws IOException { - ByteVectorValues.Bytes vectors1 = vectorValues.values(); - ByteVectorValues.Bytes vectors2 = vectorValues.values(); + 
ByteVectorValues.Bytes vectors1 = vectorValues.vectors(); + ByteVectorValues.Bytes vectors2 = vectorValues.vectors(); return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { @@ -128,8 +128,8 @@ private FloatScoringSupplier( @Override public RandomVectorScorer scorer(int ord) throws IOException { - FloatVectorValues.Floats vectors1 = vectorValues.values(); - FloatVectorValues.Floats vectors2 = vectorValues.values(); + FloatVectorValues.Floats vectors1 = vectorValues.vectors(); + FloatVectorValues.Floats vectors2 = vectorValues.vectors(); return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { @@ -151,10 +151,10 @@ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandom private final VectorSimilarityFunction similarityFunction; public FloatVectorScorer( - FloatVectorValues values, float[] query, VectorSimilarityFunction similarityFunction) + FloatVectorValues vectorValues, float[] query, VectorSimilarityFunction similarityFunction) throws IOException { - super(values); - this.vectors = values.values(); + super(vectorValues); + this.vectors = vectorValues.vectors(); this.query = query; this.similarityFunction = similarityFunction; } @@ -172,10 +172,10 @@ private static class ByteVectorScorer extends RandomVectorScorer.AbstractRandomV private final VectorSimilarityFunction similarityFunction; public ByteVectorScorer( - ByteVectorValues values, byte[] query, VectorSimilarityFunction similarityFunction) + ByteVectorValues vectorValues, byte[] query, VectorSimilarityFunction similarityFunction) throws IOException { - super(values); - vectors = values.values(); + super(vectorValues); + vectors = vectorValues.vectors(); this.query = query; this.similarityFunction = similarityFunction; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java 
b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java index 5b364765435..da670aa673b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java @@ -87,7 +87,7 @@ public RandomVectorScorer getRandomVectorScorer( scalarQuantizer.getConstantMultiplier(), scalarQuantizer.getBits()); return new RandomVectorScorer.AbstractRandomVectorScorer(quantizedByteVectorValues) { - QuantizedByteVectorValues.QuantizedBytes values = quantizedByteVectorValues.values(); + QuantizedByteVectorValues.QuantizedBytes values = quantizedByteVectorValues.vectors(); @Override public float score(int node) throws IOException { @@ -122,29 +122,29 @@ public String toString() { public static class ScalarQuantizedRandomVectorScorerSupplier implements RandomVectorScorerSupplier { - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues vectorValues; private final ScalarQuantizedVectorSimilarity similarity; private final VectorSimilarityFunction vectorSimilarityFunction; public ScalarQuantizedRandomVectorScorerSupplier( VectorSimilarityFunction similarityFunction, ScalarQuantizer scalarQuantizer, - QuantizedByteVectorValues values) { + QuantizedByteVectorValues vectorValues) { this.similarity = ScalarQuantizedVectorSimilarity.fromVectorSimilarity( similarityFunction, scalarQuantizer.getConstantMultiplier(), scalarQuantizer.getBits()); - this.values = values; + this.vectorValues = vectorValues; this.vectorSimilarityFunction = similarityFunction; } @Override public RandomVectorScorer scorer(int ord) throws IOException { - final QuantizedByteVectorValues.QuantizedBytes vectors = values.values(); + final QuantizedByteVectorValues.QuantizedBytes vectors = vectorValues.vectors(); final byte[] queryVector = vectors.get(ord); final float queryOffset = vectors.getScoreCorrectionConstant(ord); - return new 
RandomVectorScorer.AbstractRandomVectorScorer(values) { + return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { byte[] nodeVector = vectors.get(node); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java index b13c0db094b..5e0a10eef5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java @@ -68,7 +68,7 @@ public int size() { } @Override - public Bytes values() throws IOException { + public Bytes vectors() throws IOException { return new OffHeapBytes(); } @@ -291,7 +291,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { return Bytes.EMPTY; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 94831fd8ae3..919ba1a19b7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -72,7 +72,7 @@ public IndexInput getSlice() { } @Override - public Floats values() { + public Floats vectors() { IndexInput sliceCopy = slice.clone(); float[] value = new float[dimension]; return new Floats() { @@ -276,7 +276,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index eca60c4ce15..5697abcde10 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -363,7 +363,7 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); - ByteVectorValues.Bytes values = byteVectorValues.values(); + ByteVectorValues.Bytes values = byteVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector byte[] binaryValue = values.get(iter.index()); @@ -384,10 +384,10 @@ private static DocsWithFieldSet writeVectorData( ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); - FloatVectorValues.Floats dict = floatVectorValues.values(); + FloatVectorValues.Floats vectors = floatVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = dict.get(iter.index()); + float[] value = vectors.get(iter.index()); buffer.asFloatBuffer().put(value); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java index ec100c31dad..519d08e5a68 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java @@ -117,20 +117,20 @@ private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory( byte[] targetBytes, float offsetCorrection, 
float constMultiplier, - QuantizedByteVectorValues values, + QuantizedByteVectorValues vectorValues, FloatToFloatFunction scoreAdjustmentFunction) throws IOException { - QuantizedByteVectorValues.QuantizedBytes vectors = values.values(); - if (values.getScalarQuantizer().getBits() <= 4) { - if (values.getVectorByteLength() != values.dimension() && vectors.getSlice() != null) { + QuantizedByteVectorValues.QuantizedBytes vectors = vectorValues.vectors(); + if (vectorValues.getScalarQuantizer().getBits() <= 4) { + if (vectorValues.getVectorByteLength() != vectorValues.dimension() && vectors.getSlice() != null) { return new CompressedInt4DotProduct( - values, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); + vectorValues, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); } return new Int4DotProduct( - values, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); + vectorValues, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); } return new DotProduct( - values, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); + vectorValues, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); } private static class Euclidean extends RandomVectorScorer.AbstractRandomVectorScorer { @@ -138,10 +138,10 @@ private static class Euclidean extends RandomVectorScorer.AbstractRandomVectorSc private final byte[] targetBytes; private final QuantizedByteVectorValues.QuantizedBytes vectors; - private Euclidean(QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) + private Euclidean(QuantizedByteVectorValues vectorValues, float constMultiplier, byte[] targetBytes) throws IOException { - super(values); - vectors = values.values(); + super(vectorValues); + vectors = vectorValues.vectors(); this.constMultiplier = constMultiplier; this.targetBytes = targetBytes; } @@ -164,15 +164,15 @@ private static class DotProduct extends 
RandomVectorScorer.AbstractRandomVectorS private final FloatToFloatFunction scoreAdjustmentFunction; public DotProduct( - QuantizedByteVectorValues values, + QuantizedByteVectorValues vectorValues, float constMultiplier, byte[] targetBytes, float offsetCorrection, FloatToFloatFunction scoreAdjustmentFunction) throws IOException { - super(values); + super(vectorValues); this.constMultiplier = constMultiplier; - vectors = values.values(); + vectors = vectorValues.vectors(); this.targetBytes = targetBytes; this.offsetCorrection = offsetCorrection; this.scoreAdjustmentFunction = scoreAdjustmentFunction; @@ -193,7 +193,7 @@ public float score(int vectorOrdinal) throws IOException { private static class CompressedInt4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues vectorValues; private final QuantizedByteVectorValues.QuantizedBytes vectors; private final byte[] compressedVector; private final byte[] targetBytes; @@ -201,17 +201,17 @@ private static class CompressedInt4DotProduct private final FloatToFloatFunction scoreAdjustmentFunction; private CompressedInt4DotProduct( - QuantizedByteVectorValues values, + QuantizedByteVectorValues vectorValues, float constMultiplier, byte[] targetBytes, float offsetCorrection, FloatToFloatFunction scoreAdjustmentFunction) throws IOException { - super(values); + super(vectorValues); this.constMultiplier = constMultiplier; - this.values = values; - vectors = values.values(); - this.compressedVector = new byte[values.getVectorByteLength()]; + this.vectorValues = vectorValues; + vectors = vectorValues.vectors(); + this.compressedVector = new byte[vectorValues.getVectorByteLength()]; this.targetBytes = targetBytes; this.offsetCorrection = offsetCorrection; this.scoreAdjustmentFunction = scoreAdjustmentFunction; @@ -221,7 +221,7 @@ private CompressedInt4DotProduct( public float score(int 
vectorOrdinal) throws IOException { // get compressed vector, in Lucene99, vector values are stored and have a single value for // offset correction - vectors.getSlice().seek((long) vectorOrdinal * (values.getVectorByteLength() + Float.BYTES)); + vectors.getSlice().seek((long) vectorOrdinal * (vectorValues.getVectorByteLength() + Float.BYTES)); vectors.getSlice().readBytes(compressedVector, 0, compressedVector.length); float vectorOffset = vectors.getScoreCorrectionConstant(vectorOrdinal); int dotProduct = VectorUtil.int4DotProductPacked(targetBytes, compressedVector); @@ -240,15 +240,15 @@ private static class Int4DotProduct extends RandomVectorScorer.AbstractRandomVec private final FloatToFloatFunction scoreAdjustmentFunction; public Int4DotProduct( - QuantizedByteVectorValues values, + QuantizedByteVectorValues vectorValues, float constMultiplier, byte[] targetBytes, float offsetCorrection, FloatToFloatFunction scoreAdjustmentFunction) throws IOException { - super(values); + super(vectorValues); this.constMultiplier = constMultiplier; - vectors = values.values(); + vectors = vectorValues.vectors(); this.targetBytes = targetBytes; this.offsetCorrection = offsetCorrection; this.scoreAdjustmentFunction = scoreAdjustmentFunction; @@ -275,14 +275,14 @@ private static final class ScalarQuantizedRandomVectorScorerSupplier implements RandomVectorScorerSupplier { private final VectorSimilarityFunction vectorSimilarityFunction; - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues vectorValues; private final QuantizedByteVectorValues.QuantizedBytes vectors; public ScalarQuantizedRandomVectorScorerSupplier( - QuantizedByteVectorValues values, VectorSimilarityFunction vectorSimilarityFunction) + QuantizedByteVectorValues vectorValues, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { - this.values = values; - this.vectors = values.values(); + this.vectorValues = vectorValues; + this.vectors = 
vectorValues.vectors(); this.vectorSimilarityFunction = vectorSimilarityFunction; } @@ -294,8 +294,8 @@ public RandomVectorScorer scorer(int ord) throws IOException { vectorValue, offsetCorrection, vectorSimilarityFunction, - values.getScalarQuantizer().getConstantMultiplier(), - values); + vectorValues.getScalarQuantizer().getConstantMultiplier(), + vectorValues); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java index 955dfb79bad..cacaaa1306f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java @@ -421,12 +421,12 @@ public int size() { } @Override - public Floats values() throws IOException { - Floats rawDict = rawVectorValues.values(); + public Floats vectors() throws IOException { + Floats rawVectors = rawVectorValues.vectors(); return new Floats() { @Override public float[] get(int ord) throws IOException { - return rawDict.get(ord); + return rawVectors.get(ord); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index a90f061caa9..fc401c93529 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -729,7 +729,7 @@ public static DocsWithFieldSet writeQuantizedVectorData( quantizedByteVectorValues.dimension(), bits) : null; KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); - QuantizedByteVectorValues.QuantizedBytes vectors = quantizedByteVectorValues.values(); + 
QuantizedByteVectorValues.QuantizedBytes vectors = quantizedByteVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector byte[] binaryValue = vectors.get(iter.index()); @@ -869,7 +869,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) throws IOException { @@ -888,16 +888,16 @@ public DocIndexIterator iterator() { } static class QuantizedByteVectorValueSub extends DocIDMerger.Sub { - private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues vectorValues; private final QuantizedByteVectorValues.QuantizedBytes vectors; private final KnnVectorValues.DocIndexIterator iterator; - QuantizedByteVectorValueSub(MergeState.DocMap docMap, QuantizedByteVectorValues values) + QuantizedByteVectorValueSub(MergeState.DocMap docMap, QuantizedByteVectorValues vectorValues) throws IOException { super(docMap); - this.values = values; - iterator = values.iterator(); - vectors = values.values(); + this.vectorValues = vectorValues; + iterator = vectorValues.iterator(); + vectors = vectorValues.vectors(); assert iterator.docID() == -1; } @@ -972,13 +972,13 @@ private MergedQuantizedVectorValues( docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); int totalSize = 0; for (QuantizedByteVectorValueSub sub : subs) { - totalSize += sub.values.size(); + totalSize += sub.vectorValues.size(); } size = totalSize; } @Override - public QuantizedBytes values() throws IOException { + public QuantizedBytes vectors() throws IOException { return new QuantizedBytes() { @Override public byte[] get(int ord) throws IOException { @@ -1004,7 +1004,7 @@ public int size() { @Override public int dimension() { - return subs.get(0).values.dimension(); + return subs.get(0).vectorValues.dimension(); } private class CompositeIterator extends DocIndexIterator { @@ -1052,36 +1052,36 @@ public long cost() { } static 
class QuantizedFloatVectorValues extends QuantizedByteVectorValues { - private final FloatVectorValues values; + private final FloatVectorValues vectorValues; private final ScalarQuantizer quantizer; private final VectorSimilarityFunction vectorSimilarityFunction; public QuantizedFloatVectorValues( - FloatVectorValues values, + FloatVectorValues vectorValues, VectorSimilarityFunction vectorSimilarityFunction, ScalarQuantizer quantizer) throws IOException { - this.values = values; + this.vectorValues = vectorValues; this.quantizer = quantizer; this.vectorSimilarityFunction = vectorSimilarityFunction; } @Override public int dimension() { - return values.dimension(); + return vectorValues.dimension(); } @Override public int size() { - return values.size(); + return vectorValues.size(); } @Override - public QuantizedBytes values() throws IOException { + public QuantizedBytes vectors() throws IOException { return new QuantizedBytes() { - FloatVectorValues.Floats vectors = values.values(); - byte[] quantizedVector = new byte[values.dimension()]; + FloatVectorValues.Floats vectors = vectorValues.vectors(); + byte[] quantizedVector = new byte[vectorValues.dimension()]; float offsetValue = 0f; int lastOrd = -1; @@ -1119,12 +1119,12 @@ public VectorScorer scorer(float[] target) throws IOException { @Override public int ordToDoc(int ord) { - return values.ordToDoc(ord); + return vectorValues.ordToDoc(ord); } @Override public DocIndexIterator iterator() throws IOException { - return values.iterator(); + return vectorValues.iterator(); } } @@ -1176,9 +1176,9 @@ static final class OffsetCorrectedQuantizedByteVectorValues extends QuantizedByt } @Override - public QuantizedBytes values() throws IOException { + public QuantizedBytes vectors() throws IOException { return new QuantizedBytes() { - Bytes vectors = in.values(); + Bytes vectors = in.vectors(); @Override public byte[] get(int ord) throws IOException { @@ -1215,32 +1215,32 @@ public DocIndexIterator iterator() throws 
IOException { } static final class NormalizedFloatVectorValues extends FloatVectorValues { - private final FloatVectorValues values; + private final FloatVectorValues vectorValues; private final Floats floats; - public NormalizedFloatVectorValues(FloatVectorValues values) throws IOException { - this.values = values; - floats = values.values(); + public NormalizedFloatVectorValues(FloatVectorValues vectorValues) throws IOException { + this.vectorValues = vectorValues; + floats = vectorValues.vectors(); } @Override public int dimension() { - return values.dimension(); + return vectorValues.dimension(); } @Override public int size() { - return values.size(); + return vectorValues.size(); } @Override public int ordToDoc(int ord) { - return values.ordToDoc(ord); + return vectorValues.ordToDoc(ord); } @Override - public Floats values() { - float[] normalizedVector = new float[values.dimension()]; + public Floats vectors() { + float[] normalizedVector = new float[vectorValues.dimension()]; return new Floats() { @Override public float[] get(int ord) throws IOException { @@ -1253,7 +1253,7 @@ public float[] get(int ord) throws IOException { @Override public DocIndexIterator iterator() throws IOException { - return values.iterator(); + return vectorValues.iterator(); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java index 2de462c857c..5fdccb43c42 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java @@ -121,7 +121,7 @@ public int size() { } @Override - public QuantizedBytes values() throws IOException { + public QuantizedBytes vectors() throws IOException { return new QuantizedBytes() { ByteBuffer byteBuffer = ByteBuffer.allocate(dimension); byte[] binaryValue = 
byteBuffer.array(); @@ -350,7 +350,7 @@ public DocIndexIterator iterator() { } @Override - public QuantizedBytes values() { + public QuantizedBytes vectors() { return QuantizedBytes.EMPTY; } diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java index aa3381a34cb..8e3f2264d40 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java @@ -52,7 +52,7 @@ public byte[] get(int ord) { } /** Returns a random access (lookup by ord) provider of the vector values */ - public abstract Bytes values() throws IOException; + public abstract Bytes vectors() throws IOException; /** * Checks the Vector Encoding of a field @@ -110,7 +110,7 @@ public int dimension() { } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { @Override public byte[] get(int targetOrd) { diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index f9db7143724..3fed58305ef 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2755,22 +2755,22 @@ private static boolean vectorsReaderSupportsSearch(CodecReader codecReader, Stri } private static void checkFloatVectorValues( - FloatVectorValues values, + FloatVectorValues vectorValues, FieldInfo fieldInfo, CheckIndex.Status.VectorValuesStatus status, CodecReader codecReader) throws IOException { int count = 0; - int everyNdoc = Math.max(values.size() / 64, 1); - FloatVectorValues.Floats valueDict = values.values(); - while (count < values.size()) { + int everyNdoc = Math.max(vectorValues.size() / 64, 1); + FloatVectorValues.Floats vectors = vectorValues.vectors(); + while (count < vectorValues.size()) { // search the first maxNumSearches vectors to exercise the graph - if 
(values.ordToDoc(count) % everyNdoc == 0) { + if (vectorValues.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); if (vectorsReaderSupportsSearch(codecReader, fieldInfo.name)) { codecReader .getVectorReader() - .search(fieldInfo.name, valueDict.get(count), collector, null); + .search(fieldInfo.name, vectors.get(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( @@ -2778,7 +2778,7 @@ private static void checkFloatVectorValues( } } } - int valueLength = valueDict.get(count).length; + int valueLength = vectors.get(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" @@ -2790,12 +2790,12 @@ private static void checkFloatVectorValues( } ++count; } - if (count != values.size()) { + if (count != vectorValues.size()) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" has size=" - + values.size() + + vectorValues.size() + " but when iterated, returns " + count + " docs with values"); @@ -2804,18 +2804,18 @@ private static void checkFloatVectorValues( } private static void checkByteVectorValues( - ByteVectorValues values, + ByteVectorValues vectorValues, FieldInfo fieldInfo, CheckIndex.Status.VectorValuesStatus status, CodecReader codecReader) throws IOException { int count = 0; - int everyNdoc = Math.max(values.size() / 64, 1); + int everyNdoc = Math.max(vectorValues.size() / 64, 1); boolean supportsSearch = vectorsReaderSupportsSearch(codecReader, fieldInfo.name); - ByteVectorValues.Bytes vectors = values.values(); - while (count < values.size()) { + ByteVectorValues.Bytes vectors = vectorValues.vectors(); + while (count < vectorValues.size()) { // search the first maxNumSearches vectors to exercise the graph - if (supportsSearch && values.ordToDoc(count) % everyNdoc == 0) { + if (supportsSearch && vectorValues.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = 
new TopKnnCollector(10, Integer.MAX_VALUE); codecReader.getVectorReader().search(fieldInfo.name, vectors.get(count), collector, null); TopDocs docs = collector.topDocs(); @@ -2836,12 +2836,12 @@ private static void checkByteVectorValues( } ++count; } - if (count != values.size()) { + if (count != vectorValues.size()) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" has size=" - + values.size() + + vectorValues.size() + " but when iterated, returns " + count + " docs with values"); diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index 363f96eb137..3d4bccf6cc2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -441,12 +441,12 @@ public int dimension() { } @Override - public Floats values() throws IOException { - Floats dict = vectorValues.values(); + public Floats vectors() throws IOException { + Floats vectors = vectorValues.vectors(); return new Floats() { @Override public float[] get(int ord) throws IOException { - return dict.get(ord); + return vectors.get(ord); } }; } @@ -490,9 +490,9 @@ public int size() { } @Override - public Bytes values() throws IOException { + public Bytes vectors() throws IOException { return new Bytes() { - Bytes vectors = vectorValues.values(); + Bytes vectors = vectorValues.vectors(); @Override public byte[] get(int ord) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index b02c060e43b..c58e885d265 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -44,7 +44,7 @@ public abstract static class Floats { } /** Returns a random access (lookup by ord) provider of the 
vector values */ - public abstract Floats values() throws IOException; + public abstract Floats vectors() throws IOException; /** * Checks the Vector Encoding of a field @@ -103,7 +103,7 @@ public int dimension() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index cbad0e23f2d..99802a163ae 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -883,7 +883,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { int lastSubIndex = -1; Floats subValues; @@ -897,7 +897,7 @@ public float[] get(int ord) throws IOException { if (newSubIndex != lastSubIndex) { lastSubIndex = newSubIndex; assert subs.get(lastSubIndex).sub != null; - subValues = subs.get(lastSubIndex).sub.values(); + subValues = subs.get(lastSubIndex).sub.vectors(); } return subValues.get(ord - subs.get(lastSubIndex).ordStart); } @@ -961,7 +961,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { int lastSubIndex = -1; Bytes subValues; @@ -976,7 +976,7 @@ public byte[] get(int ord) throws IOException { if (newSubIndex != lastSubIndex) { lastSubIndex = newSubIndex; assert subs.get(lastSubIndex).sub != null; - subValues = subs.get(lastSubIndex).sub.values(); + subValues = subs.get(lastSubIndex).sub.vectors(); } return subValues.get(ord - subs.get(lastSubIndex).ordStart); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 8d488b5b45a..56e8c9eb6d3 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -319,13 +319,13 @@ private static class SortingFloatVectorValues extends FloatVectorValues { } @Override - public Floats values() throws IOException { - Floats delegateDict = delegate.values(); + public Floats vectors() throws IOException { + Floats delegateVectors = delegate.vectors(); return new Floats() { @Override public float[] get(int ord) throws IOException { // ords are interpreted in the delegate's ord-space. - return delegateDict.get(ord); + return delegateVectors.get(ord); } }; } @@ -357,9 +357,9 @@ private static class SortingByteVectorValues extends ByteVectorValues { } @Override - public Bytes values() throws IOException { + public Bytes vectors() throws IOException { return new Bytes() { - Bytes values = delegate.values(); + Bytes values = delegate.vectors(); @Override public byte[] get(int ord) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java index 88b071bf2ce..7ffc1bef6eb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java @@ -46,7 +46,7 @@ public VectorScorer scorer(float[] query) throws IOException { /** Returns a random access (lookup by ord) provider of the quantized vector values */ @Override - public abstract QuantizedBytes values() throws IOException; + public abstract QuantizedBytes vectors() throws IOException; /** A Bytes that also provides quantization info */ public abstract static class QuantizedBytes extends Bytes implements HasIndexSlice { diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java 
b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java index 3a04650aa36..a13de82be71 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java @@ -271,12 +271,12 @@ static ScalarQuantizer fromVectors( return new ScalarQuantizer(0f, 0f, bits); } KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); - FloatVectorValues.Floats dict = floatVectorValues.values(); + FloatVectorValues.Floats vectors = floatVectorValues.vectors(); if (confidenceInterval == 1f) { float min = Float.POSITIVE_INFINITY; float max = Float.NEGATIVE_INFINITY; while (iterator.nextDoc() != NO_MORE_DOCS) { - for (float v : dict.get(iterator.index())) { + for (float v : vectors.get(iterator.index())) { min = Math.min(min, v); max = Math.max(max, v); } @@ -293,7 +293,7 @@ static ScalarQuantizer fromVectors( int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; while (iterator.nextDoc() != NO_MORE_DOCS) { - float[] vectorValue = dict.get(iterator.index()); + float[] vectorValue = vectors.get(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, i * vectorValue.length, vectorValue.length); i++; @@ -318,7 +318,7 @@ static ScalarQuantizer fromVectors( index++; } assert iterator.docID() != NO_MORE_DOCS; - float[] vectorValue = dict.get(iterator.index()); + float[] vectorValue = vectors.get(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, idx * vectorValue.length, vectorValue.length); idx++; @@ -357,12 +357,12 @@ public static ScalarQuantizer fromVectorsAutoInterval( 1 - 1f / (floatVectorValues.dimension() + 1) }; KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); - FloatVectorValues.Floats dict = floatVectorValues.values(); + FloatVectorValues.Floats vectors = floatVectorValues.vectors(); if (totalVectorCount <= sampleSize) { int scratchSize = 
Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; while (iterator.nextDoc() != NO_MORE_DOCS) { - gatherSample(dict.get(iterator.index()), quantileGatheringScratch, sampledDocs, i); + gatherSample(vectors.get(iterator.index()), quantileGatheringScratch, sampledDocs, i); i++; if (i == scratchSize) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); @@ -383,7 +383,7 @@ public static ScalarQuantizer fromVectorsAutoInterval( index++; } assert iterator.docID() != NO_MORE_DOCS; - gatherSample(dict.get(iterator.index()), quantileGatheringScratch, sampledDocs, idx); + gatherSample(vectors.get(iterator.index()), quantileGatheringScratch, sampledDocs, idx); idx++; if (idx == SCRATCH_SIZE) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index cee219d8e42..0615e1d6813 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -312,7 +312,7 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { QuantizedByteVectorValues quantizedByteVectorValues = hnswReader.getQuantizedVectorValues("f"); QuantizedByteVectorValues.QuantizedBytes byteVectors = - quantizedByteVectorValues.values(); + quantizedByteVectorValues.vectors(); for (int ord = 0; ord < quantizedByteVectorValues.size(); ord++) { byte[] vector = byteVectors.get(ord); float offset = byteVectors.getScoreCorrectionConstant(ord); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java index d4594673a58..2b59bdce35e 
100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java @@ -118,7 +118,7 @@ public int size() { } @Override - public QuantizedBytes values() { + public QuantizedBytes vectors() { return new QuantizedBytes() { @Override public byte[] get(int ord) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java index 5a3074488a7..5315ab74785 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -174,7 +174,7 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { QuantizedByteVectorValues quantizedByteVectorValues = quantizedReader.getQuantizedVectorValues("f"); QuantizedByteVectorValues.QuantizedBytes byteVectors = - quantizedByteVectorValues.values(); + quantizedByteVectorValues.vectors(); int docId = -1; KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); for (docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { diff --git a/lucene/core/src/test/org/apache/lucene/document/TestField.java b/lucene/core/src/test/org/apache/lucene/document/TestField.java index 4d0be248614..876ee07e191 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java @@ -714,7 +714,7 @@ public void testKnnVectorField() throws Exception { w.addDocument(doc); try (IndexReader r = DirectoryReader.open(w)) { ByteVectorValues binary = r.leaves().get(0).reader().getByteVectorValues("binary"); - ByteVectorValues.Bytes vectors = binary.values(); + ByteVectorValues.Bytes vectors = 
binary.vectors(); assertEquals(1, binary.size()); KnnVectorValues.DocIndexIterator iterator = binary.iterator(); assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); @@ -727,10 +727,10 @@ public void testKnnVectorField() throws Exception { assertEquals(1, floatValues.size()); KnnVectorValues.DocIndexIterator iterator1 = floatValues.iterator(); assertNotEquals(NO_MORE_DOCS, iterator1.nextDoc()); - assertEquals(vector.length, floatValues.values().get(0).length); - assertEquals(vector[0], floatValues.values().get(0)[0], 0); + assertEquals(vector.length, floatValues.vectors().get(0).length); + assertEquals(vector[0], floatValues.vectors().get(0)[0], 0); assertEquals(NO_MORE_DOCS, iterator1.nextDoc()); - expectThrows(IOException.class, () -> floatValues.values().get(1)); + expectThrows(IOException.class, () -> floatValues.vectors().get(1)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java index c18e4d18170..8c028463797 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java @@ -581,7 +581,7 @@ private static void scanAndRetrieve(LeafReader leaf, KnnVectorValues values) thr if (random().nextBoolean() && iter.docID() != DocIdSetIterator.NO_MORE_DOCS && values instanceof FloatVectorValues) { - ((FloatVectorValues) values).values().get(iter.index()); + ((FloatVectorValues) values).vectors().get(iter.index()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index b5d4babea8e..96332b84eed 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -414,7 +414,7 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect int 
nextDocWithVectors = 0; StoredFields storedFields = reader.storedFields(); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int i = 0; i < reader.maxDoc(); i++) { nextDocWithVectors = iterator.advance(i); while (i < nextDocWithVectors && i < reader.maxDoc()) { @@ -428,7 +428,7 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect } int id = Integer.parseInt(storedFields.document(i).get("id")); // documents with KnnGraphValues have the expected vectors - float[] scratch = dict.get(iterator.index()); + float[] scratch = vectors.get(iterator.index()); assertArrayEquals( "vector did not match for doc " + i + ", id=" + id + ": " + Arrays.toString(scratch), values[id], diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 6db0aa8644f..fed3e5800da 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -276,7 +276,7 @@ public void testSortOnAddIndicesRandom() throws IOException { assertEquals(1, sorted_numeric_dv.docValueCount()); assertEquals(ids.longValue(), sorted_numeric_dv.nextValue()); - float[] vectorValue = vectorValues.values().get(valuesIterator.index()); + float[] vectorValue = vectorValues.vectors().get(valuesIterator.index()); assertEquals(1, vectorValue.length); assertEquals((float) ids.longValue(), vectorValue[0], 0.001f); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java index 8d66075b0c0..0ffc8a87995 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ 
b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -183,14 +183,14 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { doc.add( knnVectorField( "field", - (T) ((ByteVectorValues) vectors).values().get(ord), + (T) ((ByteVectorValues) vectors).vectors().get(ord), similarityFunction)); } case FLOAT32 -> { doc.add( knnVectorField( "field", - (T) ((FloatVectorValues) vectors).values().get(ord), + (T) ((FloatVectorValues) vectors).vectors().get(ord), similarityFunction)); } } @@ -221,10 +221,10 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { private T vectorValue(KnnVectorValues vectors, int ord) throws IOException { switch (vectors.getEncoding()) { case BYTE -> { - return (T) ((ByteVectorValues) vectors).values().get(ord); + return (T) ((ByteVectorValues) vectors).vectors().get(ord); } case FLOAT32 -> { - return (T) ((FloatVectorValues) vectors).values().get(ord); + return (T) ((FloatVectorValues) vectors).vectors().get(ord); } } throw new AssertionError("unknown encoding " + vectors.getEncoding()); @@ -1120,7 +1120,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { float[] value = new float[2]; @@ -1175,7 +1175,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { byte[] bValue = new byte[2]; float[] value = new float[2]; diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java index 719d7df76b4..3d366870a4b 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java @@ -55,7 +55,7 @@ public int dimension() { } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { byte[] scratch = new byte[dimension]; diff --git 
a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java index a25ed570de3..bf11293f49d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java @@ -50,7 +50,7 @@ public int dimension() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { float[] scratch = new float[dimension]; diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java index 91611fbe775..12471f8ea28 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java @@ -110,7 +110,7 @@ MockByteVectorValues vectorValues( @Override MockByteVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { ByteVectorValues vectorValues = reader.getByteVectorValues(fieldName); - ByteVectorValues.Bytes byteVectors = vectorValues.values(); + ByteVectorValues.Bytes byteVectors = vectorValues.vectors(); byte[][] vectors = new byte[reader.maxDoc()][]; for (int i = 0; i < vectorValues.size(); i++) { vectors[vectorValues.ordToDoc(i)] = diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java index 7231460928d..519e1b03469 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java @@ -71,11 +71,11 @@ MockVectorValues vectorValues(float[][] values) { @Override MockVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { FloatVectorValues vectorValues = reader.getFloatVectorValues(fieldName); - 
FloatVectorValues.Floats dict = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); float[][] vectors = new float[reader.maxDoc()][]; for (int i = 0; i < vectorValues.size(); i++) { vectors[vectorValues.ordToDoc(i)] = - ArrayUtil.copyOfSubArray(dict.get(i), 0, vectorValues.dimension()); + ArrayUtil.copyOfSubArray(vectors.get(i), 0, vectorValues.dimension()); } return MockVectorValues.fromValues(vectors); } diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java index 0b5a6223db6..575ada88721 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java @@ -238,7 +238,7 @@ private static FloatVectorValues fromFloatsNormalized( float[][] floats, Set deletedVectors) { return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) { @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java index 49db2be488f..31d4d151b6a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java @@ -306,7 +306,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) throws IOException { diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 
6b5feb71dca..c7301c1ed64 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -2301,7 +2301,7 @@ public int size() { } @Override - public Floats values() { + public Floats vectors() { return new Floats() { @Override public float[] get(int ord) { @@ -2330,7 +2330,7 @@ public VectorScorer scorer(float[] query) { } MemoryFloatVectorValues vectorValues = new MemoryFloatVectorValues(info); DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats floats = vectorValues.values(); + FloatVectorValues.Floats floats = vectorValues.vectors(); return new VectorScorer() { @Override public float score() throws IOException { @@ -2364,7 +2364,7 @@ public int size() { } @Override - public Bytes values() { + public Bytes vectors() { return new Bytes() { @Override public byte[] get(int ord) { @@ -2392,7 +2392,7 @@ public VectorScorer scorer(byte[] query) throws IOException { + info.fieldInfo.getVectorDimension()); } MemoryByteVectorValues vectorValues = new MemoryByteVectorValues(info); - ByteVectorValues.Bytes vectors = vectorValues.values(); + ByteVectorValues.Bytes vectors = vectorValues.vectors(); DocIndexIterator iterator = vectorValues.iterator(); return new VectorScorer() { @Override diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java index 4bd9a0bf87c..6944e0e0459 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java @@ -854,7 +854,7 @@ private static void assertFloatVectorValue(MemoryIndex mi, String fieldName, flo assertNotNull(fvv); KnnVectorValues.DocIndexIterator iterator = fvv.iterator(); assertEquals(0, iterator.nextDoc()); - assertArrayEquals(expected, fvv.values().get(0), 1e-6f); + assertArrayEquals(expected, 
fvv.vectors().get(0), 1e-6f); assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); } @@ -890,7 +890,7 @@ private static void assertByteVectorValue(MemoryIndex mi, String fieldName, byte assertNotNull(bvv); KnnVectorValues.DocIndexIterator iterator = bvv.iterator(); assertEquals(0, iterator.nextDoc()); - assertArrayEquals(expected, bvv.values().get(0)); + assertArrayEquals(expected, bvv.vectors().get(0)); assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java index 595df5f11fb..7dd4a5cd91f 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java @@ -65,7 +65,7 @@ protected DocIdSetIterator getVectorIterator() { return new VectorFieldFunction(this) { KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - ByteVectorValues.Bytes vectors = vectorValues.values(); + ByteVectorValues.Bytes vectors = vectorValues.vectors(); @Override public byte[] byteVectorVal(int doc) throws IOException { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java index ae86126cd8c..c05ef6a48c3 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java @@ -64,7 +64,7 @@ protected DocIdSetIterator getVectorIterator() { return new VectorFieldFunction(this) { KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats values = 
vectorValues.values(); + FloatVectorValues.Floats values = vectorValues.vectors(); @Override public float[] floatVectorVal(int doc) throws IOException { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java index 3a3d8f97e8a..8003c8fa9de 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java @@ -38,8 +38,8 @@ public class KMeans { public static final int DEFAULT_ITRS = 10; public static final int DEFAULT_SAMPLE_SIZE = 100_000; - private final FloatVectorValues vectors; - private final FloatVectorValues.Floats vectorValues; + private final FloatVectorValues vectorValues; + private final FloatVectorValues.Floats vectors; private final int numVectors; private final int numCentroids; private final Random random; @@ -141,16 +141,16 @@ public static Results cluster( } private KMeans( - FloatVectorValues vectors, + FloatVectorValues vectorValues, int numCentroids, Random random, KmeansInitializationMethod initializationMethod, int restarts, int iters) throws IOException { - this.vectors = vectors; - this.vectorValues = vectors.values(); - this.numVectors = vectors.size(); + this.vectorValues = vectorValues; + this.vectors = vectorValues.vectors(); + this.numVectors = vectorValues.size(); this.numCentroids = numCentroids; this.random = random; this.initializationMethod = initializationMethod; @@ -173,7 +173,7 @@ private float[][] computeCentroids(boolean normalizeCenters) throws IOException }; double prevSquaredDist = Double.MAX_VALUE; for (int iter = 0; iter < iters; iter++) { - squaredDist = runKMeansStep(vectors, centroids, vectorCentroids, false, normalizeCenters); + squaredDist = runKMeansStep(vectorValues, centroids, vectorCentroids, false, normalizeCenters); // Check for convergence if (prevSquaredDist <= (squaredDist 
+ 1e-6)) { break; @@ -200,7 +200,7 @@ private float[][] initializeForgy() throws IOException { float[][] initialCentroids = new float[numCentroids][]; int i = 0; for (Integer selectedIdx : selection) { - float[] vector = vectorValues.get(selectedIdx); + float[] vector = vectors.get(selectedIdx); initialCentroids[i++] = ArrayUtil.copyOfSubArray(vector, 0, vector.length); } return initialCentroids; @@ -210,7 +210,7 @@ private float[][] initializeForgy() throws IOException { private float[][] initializeReservoirSampling() throws IOException { float[][] initialCentroids = new float[numCentroids][]; for (int index = 0; index < numVectors; index++) { - float[] vector = vectorValues.get(index); + float[] vector = vectors.get(index); if (index < numCentroids) { initialCentroids[index] = ArrayUtil.copyOfSubArray(vector, 0, vector.length); } else if (random.nextDouble() < numCentroids * (1.0 / index)) { @@ -226,7 +226,7 @@ private float[][] initializePlusPlus() throws IOException { float[][] initialCentroids = new float[numCentroids][]; // Choose the first centroid uniformly at random int firstIndex = random.nextInt(numVectors); - float[] value = vectorValues.get(firstIndex); + float[] value = vectors.get(firstIndex); initialCentroids[0] = ArrayUtil.copyOfSubArray(value, 0, value.length); // Store distances of each point to the nearest centroid @@ -239,7 +239,7 @@ private float[][] initializePlusPlus() throws IOException { double totalSum = 0; for (int j = 0; j < numVectors; j++) { // TODO: replace with RandomVectorScorer::score possible on quantized vectors - float dist = VectorUtil.squareDistance(vectorValues.get(j), initialCentroids[i - 1]); + float dist = VectorUtil.squareDistance(vectors.get(j), initialCentroids[i - 1]); if (dist < minDistances[j]) { minDistances[j] = dist; } @@ -258,7 +258,7 @@ private float[][] initializePlusPlus() throws IOException { } } // Update centroid - value = vectorValues.get(nextCentroidIndex); + value = vectors.get(nextCentroidIndex); 
initialCentroids[i] = ArrayUtil.copyOfSubArray(value, 0, value.length); } return initialCentroids; @@ -277,7 +277,7 @@ private float[][] initializePlusPlus() throws IOException { * @throws IOException if there is an error accessing vector values */ private static double runKMeansStep( - FloatVectorValues vectors, + FloatVectorValues vectorValues, float[][] centroids, short[] docCentroids, boolean useKahanSummation, @@ -292,10 +292,10 @@ private static double runKMeansStep( compensations = new float[numCentroids][centroids[0].length]; } - FloatVectorValues.Floats values = vectors.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); double sumSquaredDist = 0; - for (int docID = 0; docID < vectors.size(); docID++) { - float[] vector = values.get(docID); + for (int docID = 0; docID < vectorValues.size(); docID++) { + float[] vector = vectors.get(docID); short bestCentroid = 0; if (numCentroids > 1) { float minSquaredDist = Float.MAX_VALUE; @@ -335,7 +335,7 @@ private static double runKMeansStep( } } if (unassignedCentroids.size() > 0) { - assignCentroids(vectors, centroids, unassignedCentroids); + assignCentroids(vectorValues, centroids, unassignedCentroids); } if (normalizeCentroids) { for (int c = 0; c < centroids.length; c++) { @@ -350,7 +350,7 @@ private static double runKMeansStep( * descending distance to the current centroid set */ static void assignCentroids( - FloatVectorValues vectors, float[][] centroids, List unassignedCentroidsIdxs) + FloatVectorValues vectorValues, float[][] centroids, List unassignedCentroidsIdxs) throws IOException { int[] assignedCentroidsIdxs = new int[centroids.length - unassignedCentroidsIdxs.size()]; int assignedIndex = 0; @@ -359,17 +359,17 @@ static void assignCentroids( assignedCentroidsIdxs[assignedIndex++] = i; } } - FloatVectorValues.Floats vectorValues = vectors.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); NeighborQueue queue = new NeighborQueue(unassignedCentroidsIdxs.size(), 
false); - for (int i = 0; i < vectors.size(); i++) { - float[] vector = vectorValues.get(i); + for (int i = 0; i < vectorValues.size(); i++) { + float[] vector = vectors.get(i); for (short j = 0; j < assignedCentroidsIdxs.length; j++) { float squareDist = VectorUtil.squareDistance(centroids[assignedCentroidsIdxs[j]], vector); queue.insertWithOverflow(i, squareDist); } } for (int i = 0; i < unassignedCentroidsIdxs.size(); i++) { - float[] vector = vectorValues.get(queue.topNode()); + float[] vector = vectors.get(queue.topNode()); int unassignedCentroidIdx = unassignedCentroidsIdxs.get(i); centroids[unassignedCentroidIdx] = ArrayUtil.copyArray(vector); queue.pop(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java index 02487668d54..fb977025f83 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java @@ -53,8 +53,8 @@ public IndexInput getSlice() { } @Override - public Floats values() throws IOException { - Floats originValues = origin.values(); + public Floats vectors() throws IOException { + Floats originValues = origin.vectors(); return new Floats() { @Override public float[] get(int targetOrd) throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index da2a1b1ba60..c0171ac2bf3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -440,7 +440,7 @@ public void testAddIndexesDirectory0() throws Exception { FloatVectorValues vectorValues = 
r.getFloatVectorValues(fieldName); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); assertEquals(0, iterator.nextDoc()); - assertEquals(0, vectorValues.values().get(0)[0], 0); + assertEquals(0, vectorValues.vectors().get(0)[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } @@ -466,7 +466,7 @@ public void testAddIndexesDirectory1() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); - assertEquals(0, vectorValues.values().get(iterator.index())[0], 0); + assertEquals(0, vectorValues.vectors().get(iterator.index())[0], 0); assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } @@ -495,10 +495,10 @@ public void testAddIndexesDirectory01() throws Exception { KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); assertEquals(0, iterator.nextDoc()); // The merge order is randomized, we might get 0 first, or 1 - float value = vectorValues.values().get(0)[0]; + float value = vectorValues.vectors().get(0)[0]; assertTrue(value == 0 || value == 1); assertEquals(1, iterator.nextDoc()); - value += vectorValues.values().get(1)[0]; + value += vectorValues.vectors().get(1)[0]; assertEquals(1, value, 0); } } @@ -882,7 +882,7 @@ public void testSparseVectors() throws Exception { for (LeafReaderContext ctx : r.leaves()) { ByteVectorValues byteVectorValues = ctx.reader().getByteVectorValues(fieldName); if (byteVectorValues != null) { - ByteVectorValues.Bytes vectors = byteVectorValues.values(); + ByteVectorValues.Bytes vectors = byteVectorValues.vectors(); docCount += byteVectorValues.size(); KnnVectorValues.DocIndexIterator iterator = byteVectorValues.iterator(); while (true) { @@ -898,7 +898,7 @@ public void testSparseVectors() throws Exception { if (vectorValues != null) { docCount += vectorValues.size(); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - 
FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); while (true) { if (!(iterator.nextDoc() != NO_MORE_DOCS)) break; checksum += vectors.get(iterator.index())[0]; @@ -1137,7 +1137,7 @@ public void testIndexedValueNotAliased() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); assertEquals(3, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); iterator.nextDoc(); assertEquals(0, iterator.index()); assertEquals(1, vectors.get(0)[0], 0); @@ -1170,7 +1170,7 @@ public void testSortedIndex() throws Exception { assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); assertEquals("1", storedFields.document(iterator.nextDoc()).get("id")); assertEquals(-1f, vectors.get(0)[0], 0); assertEquals("2", storedFields.document(iterator.nextDoc()).get("id")); @@ -1198,7 +1198,7 @@ public void testSortedIndexBytes() throws Exception { StoredFields storedFields = leaf.storedFields(); ByteVectorValues vectorValues = leaf.getByteVectorValues(fieldName); - ByteVectorValues.Bytes vectors = vectorValues.values(); + ByteVectorValues.Bytes vectors = vectorValues.vectors(); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); DocIdSetIterator iter = vectorValues.iterator(); @@ -1243,7 +1243,7 @@ public void testIndexMultipleKnnVectorFields() throws Exception { assertEquals(2, vectorValues.dimension()); assertEquals(2, vectorValues.size()); KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats 
vectors = vectorValues.vectors(); iterator.nextDoc(); assertEquals(1f, vectors.get(0)[0], 0); iterator.nextDoc(); @@ -1252,7 +1252,7 @@ public void testIndexMultipleKnnVectorFields() throws Exception { FloatVectorValues vectorValues2 = leaf.getFloatVectorValues("field2"); KnnVectorValues.DocIndexIterator it2 = vectorValues2.iterator(); - FloatVectorValues.Floats vectors2 = vectorValues2.values(); + FloatVectorValues.Floats vectors2 = vectorValues2.vectors(); assertEquals(4, vectorValues2.dimension()); assertEquals(2, vectorValues2.size()); it2.nextDoc(); @@ -1266,7 +1266,7 @@ public void testIndexMultipleKnnVectorFields() throws Exception { assertEquals(1, vectorValues3.size()); KnnVectorValues.DocIndexIterator it3 = vectorValues3.iterator(); it3.nextDoc(); - assertEquals(1f, vectorValues3.values().get(0)[0], 0.1); + assertEquals(1f, vectorValues3.vectors().get(0)[0], 0.1); assertEquals(NO_MORE_DOCS, it3.nextDoc()); } } @@ -1332,7 +1332,7 @@ public void testRandom() throws Exception { StoredFields storedFields = ctx.reader().storedFields(); int docId; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; float[] v = vectors.get(iterator.index()); @@ -1411,7 +1411,7 @@ public void testRandomBytes() throws Exception { if (vectorValues == null) { continue; } - ByteVectorValues.Bytes vectors = vectorValues.values(); + ByteVectorValues.Bytes vectors = vectorValues.vectors(); totalSize += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); int docId; @@ -1541,7 +1541,7 @@ public void testRandomWithUpdatesAndGraph() throws Exception { int docId; int numLiveDocsWithVectors = 0; KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats vectors = 
vectorValues.vectors(); while (true) { if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; float[] v = vectors.get(iterator.index()); @@ -1831,7 +1831,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { docCount += byteVectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); - ByteVectorValues.Bytes vectors = byteVectorValues.values(); + ByteVectorValues.Bytes vectors = byteVectorValues.vectors(); for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { int ord = iter.index(); checksum += vectors.get(ord)[0]; @@ -1850,7 +1850,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { for (LeafReaderContext ctx : r.leaves()) { FloatVectorValues vectorValues = ctx.reader().getFloatVectorValues("knn_vector"); if (vectorValues != null) { - FloatVectorValues.Floats vectors = vectorValues.values(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); docCount += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); @@ -1905,7 +1905,7 @@ public void testMismatchedFields() throws Exception { LeafReader leafReader = getOnlyLeafReader(reader); ByteVectorValues byteVectors = leafReader.getByteVectorValues("byte"); - ByteVectorValues.Bytes vectors = byteVectors.values(); + ByteVectorValues.Bytes vectors = byteVectors.vectors(); assertNotNull(byteVectors); KnnVectorValues.DocIndexIterator iter = byteVectors.iterator(); assertEquals(0, iter.nextDoc()); @@ -1918,12 +1918,12 @@ public void testMismatchedFields() throws Exception { assertNotNull(floatVectors); iter = floatVectors.iterator(); assertEquals(0, iter.nextDoc()); - float[] vector = floatVectors.values().get(0); + float[] vector = floatVectors.vectors().get(0); assertEquals(2, vector.length); assertEquals(1f, vector[0], 0f); assertEquals(2f, vector[1], 0f); assertEquals(1, 
iter.nextDoc()); - vector = floatVectors.values().get(1); + vector = floatVectors.vectors().get(1); assertEquals(2, vector.length); assertEquals(1f, vector[0], 0f); assertEquals(2f, vector[1], 0f); From b4febca6b1a0f9fb99824d9996ff2c7be99e43e7 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 14:19:02 -0600 Subject: [PATCH 06/25] tidy --- .../org/apache/lucene/codecs/KnnVectorsWriter.java | 6 ++++-- .../lucene99/Lucene99ScalarQuantizedVectorScorer.java | 10 +++++++--- .../src/java/org/apache/lucene/index/CheckIndex.java | 4 +--- .../lucene/sandbox/codecs/quantization/KMeans.java | 3 ++- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index d85baed323c..f96407257c4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -120,7 +120,8 @@ private static class FloatVectorValuesSub extends DocIDMerger.Sub { final FloatVectorValues vectorValues; final KnnVectorValues.DocIndexIterator iterator; - FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues vectorValues) throws IOException { + FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues vectorValues) + throws IOException { super(docMap); this.vectorValues = vectorValues; this.iterator = vectorValues.iterator(); @@ -142,7 +143,8 @@ private static class ByteVectorValuesSub extends DocIDMerger.Sub { final ByteVectorValues vectorValues; final KnnVectorValues.DocIndexIterator iterator; - ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues vectorValues) throws IOException { + ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues vectorValues) + throws IOException { super(docMap); this.vectorValues = vectorValues; iterator = vectorValues.iterator(); diff --git 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java index 519d08e5a68..d1491115d4e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java @@ -122,7 +122,8 @@ private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory( throws IOException { QuantizedByteVectorValues.QuantizedBytes vectors = vectorValues.vectors(); if (vectorValues.getScalarQuantizer().getBits() <= 4) { - if (vectorValues.getVectorByteLength() != vectorValues.dimension() && vectors.getSlice() != null) { + if (vectorValues.getVectorByteLength() != vectorValues.dimension() + && vectors.getSlice() != null) { return new CompressedInt4DotProduct( vectorValues, constMultiplier, targetBytes, offsetCorrection, scoreAdjustmentFunction); } @@ -138,7 +139,8 @@ private static class Euclidean extends RandomVectorScorer.AbstractRandomVectorSc private final byte[] targetBytes; private final QuantizedByteVectorValues.QuantizedBytes vectors; - private Euclidean(QuantizedByteVectorValues vectorValues, float constMultiplier, byte[] targetBytes) + private Euclidean( + QuantizedByteVectorValues vectorValues, float constMultiplier, byte[] targetBytes) throws IOException { super(vectorValues); vectors = vectorValues.vectors(); @@ -221,7 +223,9 @@ private CompressedInt4DotProduct( public float score(int vectorOrdinal) throws IOException { // get compressed vector, in Lucene99, vector values are stored and have a single value for // offset correction - vectors.getSlice().seek((long) vectorOrdinal * (vectorValues.getVectorByteLength() + Float.BYTES)); + vectors + .getSlice() + .seek((long) vectorOrdinal * (vectorValues.getVectorByteLength() + Float.BYTES)); vectors.getSlice().readBytes(compressedVector, 0, 
compressedVector.length); float vectorOffset = vectors.getScoreCorrectionConstant(vectorOrdinal); int dotProduct = VectorUtil.int4DotProductPacked(targetBytes, compressedVector); diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 3fed58305ef..8138d238752 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2768,9 +2768,7 @@ private static void checkFloatVectorValues( if (vectorValues.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); if (vectorsReaderSupportsSearch(codecReader, fieldInfo.name)) { - codecReader - .getVectorReader() - .search(fieldInfo.name, vectors.get(count), collector, null); + codecReader.getVectorReader().search(fieldInfo.name, vectors.get(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java index 8003c8fa9de..2ed99906da6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java @@ -173,7 +173,8 @@ private float[][] computeCentroids(boolean normalizeCenters) throws IOException }; double prevSquaredDist = Double.MAX_VALUE; for (int iter = 0; iter < iters; iter++) { - squaredDist = runKMeansStep(vectorValues, centroids, vectorCentroids, false, normalizeCenters); + squaredDist = + runKMeansStep(vectorValues, centroids, vectorCentroids, false, normalizeCenters); // Check for convergence if (prevSquaredDist <= (squaredDist + 1e-6)) { break; From 2fca27c34c999aeccfee7b89ff16060e46dcdafb Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 
Oct 2024 14:42:31 -0600 Subject: [PATCH 07/25] more renaming --- .../lucene90/Lucene90HnswGraphBuilder.java | 17 ++- .../lucene90/Lucene90HnswVectorsWriter.java | 8 +- .../lucene91/Lucene91HnswGraphBuilder.java | 42 +++--- .../lucene91/Lucene91HnswVectorsWriter.java | 12 +- .../lucene92/Lucene92HnswVectorsWriter.java | 12 +- .../TestBasicBackwardsCompatibility.java | 8 +- .../codecs/hnsw/DefaultFlatVectorScorer.java | 10 +- .../lucene/util/hnsw/HnswGraphTestCase.java | 141 ++++++++++-------- .../util/hnsw/TestHnswFloatVectorGraph.java | 14 +- .../sandbox/codecs/quantization/KMeans.java | 28 ++-- .../codecs/quantization/TestKMeans.java | 20 +-- 11 files changed, 164 insertions(+), 148 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index 7953b1c066d..68d08838adf 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -65,7 +65,7 @@ public final class Lucene90HnswGraphBuilder { * Reads all the vectors from vector values, builds a graph connecting them by their dense * ordinals, using the given hyperparameter settings, and returns the resulting graph. * - * @param vectors the vectors whose relations are represented by the graph - must provide a + * @param vectorValues the vectors whose relations are represented by the graph - must provide a * different view over those vectors than the one used to add via addGraphNode. * @param maxConn the number of connections to make when adding a new graph node; roughly speaking * the graph fanout. @@ -74,13 +74,13 @@ public final class Lucene90HnswGraphBuilder { * to ensure repeatable construction. 
*/ public Lucene90HnswGraphBuilder( - FloatVectorValues vectors, + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction, int maxConn, int beamWidth, long seed) throws IOException { - this.vectorValues = vectors; + this.vectorValues = vectorValues; this.vectors = vectorValues.vectors(); buildVectors = vectorValues.vectors(); this.similarityFunction = Objects.requireNonNull(similarityFunction); @@ -103,17 +103,18 @@ public Lucene90HnswGraphBuilder( * enables efficient retrieval without extra data copying, while avoiding collision of the * returned values. * - * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independet - * accessor for the vectors + * @param vectorValues the vectors for which to build a nearest neighbors graph. Must be an + * independent accessor for the vectors */ - public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { + public Lucene90OnHeapHnswGraph build(FloatVectorValues vectorValues) throws IOException { if (infoStream.isEnabled(HNSW_COMPONENT)) { - infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors"); + infoStream.message( + HNSW_COMPONENT, "build graph from " + vectorValues.size() + " vectorValues"); } long start = System.nanoTime(), t = start; // start at node 1!
node 0 is added implicitly, in the constructor FloatVectorValues.Floats values = vectorValues.vectors(); - for (int node = 1; node < vectors.size(); node++) { + for (int node = 1; node < vectorValues.size(); node++) { addGraphNode(values.get(node)); if (node % 10000 == 0) { if (infoStream.isEnabled(HNSW_COMPONENT)) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java index cc70df0acb9..22eb3bf7916 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java @@ -182,13 +182,13 @@ protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues * IDs. The length of the returned array matches the total number of documents with a vector * (which excludes deleted documents), so it may be less than {@link FloatVectorValues#size()}. 
*/ - private static int[] writeVectorData(IndexOutput output, FloatVectorValues vectors) + private static int[] writeVectorData(IndexOutput output, FloatVectorValues vectorValues) throws IOException { - int[] docIds = new int[vectors.size()]; + int[] docIds = new int[vectorValues.size()]; int count = 0; ByteBuffer binaryVector = - ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - KnnVectorValues.DocIndexIterator iter = vectors.iterator(); + ByteBuffer.allocate(vectorValues.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); + KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); FloatVectorValues.Floats values = vectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java index ad374fbf038..2d5aa8b7b9f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java @@ -57,8 +57,8 @@ public final class Lucene91HnswGraphBuilder { private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); private final VectorSimilarityFunction similarityFunction; - private final FloatVectorValues vectors; - private final FloatVectorValues.Floats vectorValues; + private final FloatVectorValues vectorValues; + private final FloatVectorValues.Floats vectors; private final SplittableRandom random; private final Lucene91BoundsChecker bound; private final HnswGraphSearcher graphSearcher; @@ -75,8 +75,7 @@ public final class Lucene91HnswGraphBuilder { * Reads all the vectors from vector values, builds a graph connecting them by their dense * ordinals, using the 
given hyperparameter settings, and returns the resulting graph. * - * @param vectors the vectors whose relations are represented by the graph - must provide a - * different view over those vectors than the one used to add via addGraphNode. + * @param vectorValues the vectors whose relations are represented by the graph. * @param maxConn the number of connections to make when adding a new graph node; roughly speaking * the graph fanout. * @param beamWidth the size of the beam search to use when finding nearest neighbors. @@ -84,14 +83,14 @@ public final class Lucene91HnswGraphBuilder { * to ensure repeatable construction. */ public Lucene91HnswGraphBuilder( - FloatVectorValues vectors, + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction, int maxConn, int beamWidth, long seed) throws IOException { - this.vectors = vectors; - vectorValues = vectorValues.vectors(); + this.vectorValues = vectorValues; + vectors = vectorValues.vectors(); buildVectors = vectorValues.vectors(); this.similarityFunction = Objects.requireNonNull(similarityFunction); if (maxConn <= 0) { @@ -108,7 +107,8 @@ public Lucene91HnswGraphBuilder( int levelOfFirstNode = getRandomGraphLevel(ml, random); this.hnsw = new Lucene91OnHeapHnswGraph(maxConn, levelOfFirstNode); this.graphSearcher = - new HnswGraphSearcher(new NeighborQueue(beamWidth, true), new FixedBitSet(vectors.size())); + new HnswGraphSearcher( + new NeighborQueue(beamWidth, true), new FixedBitSet(vectorValues.size())); bound = Lucene91BoundsChecker.create(false); scratch = new Lucene91NeighborArray(Math.max(beamWidth, maxConn + 1)); } @@ -118,18 +118,18 @@ public Lucene91HnswGraphBuilder( * enables efficient retrieval without extra data copying, while avoiding collision of the * returned values. * - * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent - * accessor for the vectors + * @param vectorValues the vectors for which to build a nearest neighbors graph. 
Must be an + * independent accessor for the vectors */ - public Lucene91OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { - FloatVectorValues.Floats values = vectorValues.vectors(); + public Lucene91OnHeapHnswGraph build(FloatVectorValues vectorValues) throws IOException { + FloatVectorValues.Floats vectors = vectorValues.vectors(); if (infoStream.isEnabled(HNSW_COMPONENT)) { - infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors"); + infoStream.message(HNSW_COMPONENT, "build graph from " + vectorValues.size() + " vectors"); } long start = System.nanoTime(), t = start; // start at node 1! node 0 is added implicitly, in the constructor - for (int node = 1; node < vectors.size(); node++) { - addGraphNode(node, values.get(node)); + for (int node = 1; node < vectorValues.size(); node++) { + addGraphNode(node, vectors.get(node)); if ((node % 10000 == 0) && infoStream.isEnabled(HNSW_COMPONENT)) { t = printGraphBuildStatus(node, start, t); } @@ -145,7 +145,7 @@ public void setInfoStream(InfoStream infoStream) { /** Inserts a doc with vector value to the graph */ void addGraphNode(int node, float[] value) throws IOException { RandomVectorScorer scorer = - defaultFlatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, value); + defaultFlatVectorScorer.getRandomVectorScorer(similarityFunction, vectorValues, value); HnswGraphBuilder.GraphBuilderKnnCollector candidates; final int nodeLevel = getRandomGraphLevel(ml, random); int curMaxLevel = hnsw.numLevels() - 1; @@ -222,7 +222,7 @@ private void selectDiverse(Lucene91NeighborArray neighbors, Lucene91NeighborArra int cNode = candidates.node[i]; float cScore = candidates.score[i]; assert cNode < hnsw.size(); - if (diversityCheck(vectorValues.get(cNode), cScore, neighbors, buildVectors)) { + if (diversityCheck(vectors.get(cNode), cScore, neighbors, buildVectors)) { neighbors.add(cNode, cScore); } } @@ -244,7 +244,7 @@ private void 
popToScratch(HnswGraphBuilder.GraphBuilderKnnCollector candidates) * @param score the score of the new candidate and node n, to be compared with scores of the * candidate and n's neighbors * @param neighbors the neighbors selected so far - * @param vectorValues source of values used for making comparisons between candidate and existing + * @param vectors source of values used for making comparisons between candidate and existing * neighbors * @return whether the candidate is diverse given the existing neighbors */ @@ -252,12 +252,12 @@ private boolean diversityCheck( float[] candidate, float score, Lucene91NeighborArray neighbors, - FloatVectorValues.Floats vectorValues) + FloatVectorValues.Floats vectors) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { float neighborSimilarity = - similarityFunction.compare(candidate, vectorValues.get(neighbors.node[i])); + similarityFunction.compare(candidate, vectors.get(neighbors.node[i])); if (bound.check(neighborSimilarity) == false) { return false; } @@ -291,7 +291,7 @@ private int findNonDiverse(Lucene91NeighborArray neighbors) throws IOException { // them, drop it int neighborId = neighbors.node[i]; bound.set(neighbors.score[i]); - float[] neighborVector = vectorValues.get(neighborId); + float[] neighborVector = vectors.get(neighborId); for (int j = maxConn; j > i; j--) { float neighborSimilarity = similarityFunction.compare(neighborVector, buildVectors.get(neighbors.node[j])); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java index 5e1f34ef243..25a928c7c2e 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java @@ -176,16 +176,16 @@ 
protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues /** * Writes the vector values to the output and returns a set of documents that contains vectors. */ - private static DocsWithFieldSet writeVectorData(IndexOutput output, FloatVectorValues vectors) - throws IOException { + private static DocsWithFieldSet writeVectorData( + IndexOutput output, FloatVectorValues vectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); ByteBuffer binaryVector = - ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - KnnVectorValues.DocIndexIterator iter = vectors.iterator(); - FloatVectorValues.Floats values = vectorValues.vectors(); + ByteBuffer.allocate(vectorValues.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); + KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int docV = iter.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - binaryVector.asFloatBuffer().put(values.get(iter.index())); + binaryVector.asFloatBuffer().put(vectors.get(iter.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java index 03c1ff79336..1aa6ec3941b 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java @@ -184,18 +184,18 @@ protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues /** * Writes the vector values to the output and returns a set of documents that contains vectors. 
*/ - private static DocsWithFieldSet writeVectorData(IndexOutput output, FloatVectorValues vectors) - throws IOException { + private static DocsWithFieldSet writeVectorData( + IndexOutput output, FloatVectorValues vectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); ByteBuffer binaryVector = - ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - KnnVectorValues.DocIndexIterator iterator = vectors.iterator(); - FloatVectorValues.Floats values = vectorValues.vectors(); + ByteBuffer.allocate(vectorValues.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int docV = iterator.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iterator.nextDoc()) { // write vector - binaryVector.asFloatBuffer().put(values.get(iterator.index())); + binaryVector.asFloatBuffer().put(vectors.get(iterator.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java index 2ef87707897..888a8a038b7 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java @@ -474,10 +474,10 @@ public static void searchIndex( // test vector values int cnt = 0; for (LeafReaderContext ctx : reader.leaves()) { - FloatVectorValues values = ctx.reader().getFloatVectorValues(KNN_VECTOR_FIELD); - if (values != null) { - assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension()); - KnnVectorValues.DocIndexIterator it = values.iterator(); + FloatVectorValues 
vectorValues = ctx.reader().getFloatVectorValues(KNN_VECTOR_FIELD); + if (vectorValues != null) { + assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), vectorValues.dimension()); + KnnVectorValues.DocIndexIterator it = vectorValues.iterator(); FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) { float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt}; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index a153ce086f8..3f07abcef80 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -92,8 +92,9 @@ private static final class ByteScoringSupplier implements RandomVectorScorerSupp private final VectorSimilarityFunction similarityFunction; private ByteScoringSupplier( - ByteVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { - vectorValues = vectors; + ByteVectorValues vectorValues, VectorSimilarityFunction similarityFunction) + throws IOException { + this.vectorValues = vectorValues; this.similarityFunction = similarityFunction; } @@ -121,8 +122,9 @@ private static final class FloatScoringSupplier implements RandomVectorScorerSup private final VectorSimilarityFunction similarityFunction; private FloatScoringSupplier( - FloatVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { - vectorValues = vectors; + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction) + throws IOException { + this.vectorValues = vectorValues; this.similarityFunction = similarityFunction; } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java 
index 0ffc8a87995..27dba687711 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -113,17 +113,18 @@ abstract KnnVectorValues vectorValues( abstract T getTargetVector(); - protected RandomVectorScorerSupplier buildScorerSupplier(KnnVectorValues vectors) + protected RandomVectorScorerSupplier buildScorerSupplier(KnnVectorValues vectorValues) throws IOException { - return flatVectorScorer.getRandomVectorScorerSupplier(similarityFunction, vectors); + return flatVectorScorer.getRandomVectorScorerSupplier(similarityFunction, vectorValues); } - protected RandomVectorScorer buildScorer(KnnVectorValues vectors, T query) throws IOException { + protected RandomVectorScorer buildScorer(KnnVectorValues vectorValues, T query) + throws IOException { return switch (getVectorEncoding()) { case BYTE -> - flatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, (byte[]) query); + flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorValues, (byte[]) query); case FLOAT32 -> - flatVectorScorer.getRandomVectorScorer(similarityFunction, vectors, (float[]) query); + flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorValues, (float[]) query); }; } @@ -144,7 +145,7 @@ public void testRandomReadWriteAndMerge() throws IOException { int M = random().nextInt(4) + 2; int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); - KnnVectorValues vectors = vectorValues(numVectors, dim); + KnnVectorValues vectorValues = vectorValues(numVectors, dim); HnswGraphBuilder.randSeed = seed; try (Directory dir = newDirectory()) { @@ -178,23 +179,25 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } } Document doc = new Document(); - switch (vectors.getEncoding()) { + switch (vectorValues.getEncoding()) { case BYTE -> { doc.add( knnVectorField( "field", - (T) ((ByteVectorValues) vectors).vectors().get(ord), + (T) 
((ByteVectorValues) vectorValues).vectors().get(ord), similarityFunction)); } case FLOAT32 -> { doc.add( knnVectorField( "field", - (T) ((FloatVectorValues) vectors).vectors().get(ord), + (T) ((FloatVectorValues) vectorValues).vectors().get(ord), similarityFunction)); } } - doc.add(new StringField("id", Integer.toString(vectors.ordToDoc(ord)), Field.Store.NO)); + doc.add( + new StringField( + "id", Integer.toString(vectorValues.ordToDoc(ord)), Field.Store.NO)); iw.addDocument(doc); } iw.commit(); @@ -218,16 +221,16 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } @SuppressWarnings("unchecked") - private T vectorValue(KnnVectorValues vectors, int ord) throws IOException { - switch (vectors.getEncoding()) { + private T vectorValue(KnnVectorValues vectorValues, int ord) throws IOException { + switch (vectorValues.getEncoding()) { case BYTE -> { - return (T) ((ByteVectorValues) vectors).vectors().get(ord); + return (T) ((ByteVectorValues) vectorValues).vectors().get(ord); } case FLOAT32 -> { - return (T) ((FloatVectorValues) vectors).vectors().get(ord); + return (T) ((FloatVectorValues) vectorValues).vectors().get(ord); } } - throw new AssertionError("unknown encoding " + vectors.getEncoding()); + throw new AssertionError("unknown encoding " + vectorValues.getEncoding()); } interface Vectors { @@ -241,10 +244,10 @@ public void testReadWrite() throws IOException { int M = random().nextInt(4) + 2; int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); - KnnVectorValues vectors = vectorValues(nDoc, dim); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + KnnVectorValues vectorValues = vectorValues(nDoc, dim); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed); - HnswGraph hnsw = builder.build(vectors.size()); + HnswGraph hnsw = builder.build(vectorValues.size()); 
expectThrows(IllegalStateException.class, () -> builder.addGraphNode(0)); // Recreate the graph while indexing with the same random seed and write it out @@ -269,7 +272,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } }); try (IndexWriter iw = new IndexWriter(dir, iwc)) { - KnnVectorValues.DocIndexIterator it2 = vectors.iterator(); + KnnVectorValues.DocIndexIterator it2 = vectorValues.iterator(); while (it2.nextDoc() != NO_MORE_DOCS) { while (indexedDoc < it2.docID()) { // increment docId in the index by adding empty documents @@ -277,7 +280,8 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("field", vectorValue(vectors, it2.index()), similarityFunction)); + doc.add( + knnVectorField("field", vectorValue(vectorValues, it2.index()), similarityFunction)); doc.add(new StoredField("id", it2.docID())); iw.addDocument(doc); nVec++; @@ -291,7 +295,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { assertEquals(nVec, values.size()); assertEquals(indexedDoc, ctx.reader().maxDoc()); assertEquals(indexedDoc, ctx.reader().numDocs()); - assertVectorsEqual(vectors, values); + assertVectorsEqual(vectorValues, values); HnswGraph graphValues = ((Lucene99HnswVectorsReader) ((PerFieldKnnVectorsFormat.FieldsReader) @@ -308,7 +312,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { public void testSortedAndUnsortedIndicesReturnSameResults() throws IOException { int dim = random().nextInt(10) + 3; int nDoc = random().nextInt(200) + 100; - KnnVectorValues vectors = vectorValues(nDoc, dim); + KnnVectorValues vectorValues = vectorValues(nDoc, dim); int M = random().nextInt(10) + 5; int beamWidth = random().nextInt(10) + 10; @@ -351,15 +355,15 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { int indexedDoc = 0; try (IndexWriter iw = new IndexWriter(dir, iwc); IndexWriter iw2 = new IndexWriter(dir2, iwc2)) { - 
for (int ord = 0; ord < vectors.size(); ord++) { - while (indexedDoc < vectors.ordToDoc(ord)) { + for (int ord = 0; ord < vectorValues.size(); ord++) { + while (indexedDoc < vectorValues.ordToDoc(ord)) { // increment docId in the index by adding empty documents iw.addDocument(new Document()); indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("vector", vectorValue(vectors, ord), similarityFunction)); - doc.add(new StoredField("id", vectors.ordToDoc(ord))); + doc.add(knnVectorField("vector", vectorValue(vectorValues, ord), similarityFunction)); + doc.add(new StoredField("id", vectorValues.ordToDoc(ord))); doc.add(new NumericDocValuesField("sortkey", random().nextLong())); iw.addDocument(doc); iw2.addDocument(doc); @@ -489,15 +493,15 @@ void assertGraphEqual(HnswGraph g, HnswGraph h) throws IOException { public void testAknnDiverse() throws IOException { int nDoc = 100; similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - KnnVectorValues vectors = circularVectorValues(nDoc); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + KnnVectorValues vectorValues = circularVectorValues(nDoc); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 100, random().nextInt()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); // run some searches KnnCollector nn = HnswGraphSearcher.search( - buildScorer(vectors, getTargetVector()), 10, hnsw, null, Integer.MAX_VALUE); + buildScorer(vectorValues, getTargetVector()), 10, hnsw, null, Integer.MAX_VALUE); TopDocs topDocs = nn.topDocs(); assertEquals("Number of found results is not equal to [10].", 10, topDocs.scoreDocs.length); int sum = 0; @@ -521,16 +525,16 @@ public void testAknnDiverse() throws IOException { @SuppressWarnings("unchecked") public void testSearchWithAcceptOrds() throws IOException { int nDoc = 100; - 
KnnVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectorValues = circularVectorValues(nDoc); similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); // the first 10 docs must not be deleted to ensure the expected recall Bits acceptOrds = createRandomAcceptOrds(10, nDoc); KnnCollector nn = HnswGraphSearcher.search( - buildScorer(vectors, getTargetVector()), 10, hnsw, acceptOrds, Integer.MAX_VALUE); + buildScorer(vectorValues, getTargetVector()), 10, hnsw, acceptOrds, Integer.MAX_VALUE); TopDocs nodes = nn.topDocs(); assertEquals("Number of found results is not equal to [10].", 10, nodes.scoreDocs.length); int sum = 0; @@ -546,11 +550,11 @@ public void testSearchWithAcceptOrds() throws IOException { @SuppressWarnings("unchecked") public void testSearchWithSelectiveAcceptOrds() throws IOException { int nDoc = 100; - KnnVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectorValues = circularVectorValues(nDoc); similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); // Only mark a few vectors as accepted BitSet acceptOrds = new FixedBitSet(nDoc); for (int i = 0; i < nDoc; i += random().nextInt(15, 20)) { @@ -561,7 +565,7 @@ public void testSearchWithSelectiveAcceptOrds() throws IOException { int 
numAccepted = acceptOrds.cardinality(); KnnCollector nn = HnswGraphSearcher.search( - buildScorer(vectors, getTargetVector()), + buildScorer(vectorValues, getTargetVector()), numAccepted, hnsw, acceptOrds, @@ -737,16 +741,16 @@ private int[] createOffsetOrdinalMap( public void testVisitedLimit() throws IOException { int nDoc = 500; similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - KnnVectorValues vectors = circularVectorValues(nDoc); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + KnnVectorValues vectorValues = circularVectorValues(nDoc); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); int topK = 50; int visitedLimit = topK + random().nextInt(5); KnnCollector nn = HnswGraphSearcher.search( - buildScorer(vectors, getTargetVector()), + buildScorer(vectorValues, getTargetVector()), topK, hnsw, createRandomAcceptOrds(0, nDoc), @@ -772,12 +776,12 @@ public void testRamUsageEstimate() throws IOException { int M = randomIntBetween(4, 96); similarityFunction = RandomizedTest.randomFrom(VectorSimilarityFunction.values()); - KnnVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectorValues = vectorValues(size, dim); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, M * 2, random().nextLong()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); long estimated = RamUsageEstimator.sizeOfObject(hnsw); long actual = ramUsed(hnsw); @@ -797,9 +801,9 @@ public void testDiversity() throws IOException { unitVector2d(0.77), unitVector2d(0.6) }; - 
KnnVectorValues vectors = vectorValues(values); + KnnVectorValues vectorValues = vectorValues(values); // First add nodes until everybody gets a full neighbor list - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 2, 10, random().nextInt()); // node 0 is added by the builder constructor builder.addGraphNode(0); @@ -851,9 +855,9 @@ public void testDiversityFallback() throws IOException { {10, 0, 0}, {0, 4, 0} }; - KnnVectorValues vectors = vectorValues(values); + KnnVectorValues vectorValues = vectorValues(values); // First add nodes until everybody gets a full neighbor list - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 1, 10, random().nextInt()); builder.addGraphNode(0); builder.addGraphNode(1); @@ -881,9 +885,9 @@ public void testDiversity3d() throws IOException { {0, 0, 20}, {0, 9, 0} }; - KnnVectorValues vectors = vectorValues(values); + KnnVectorValues vectorValues = vectorValues(values); // First add nodes until everybody gets a full neighbor list - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 1, 10, random().nextInt()); builder.addGraphNode(0); builder.addGraphNode(1); @@ -917,11 +921,11 @@ private void assertLevel0Neighbors(OnHeapHnswGraph graph, int node, int... 
expec public void testRandom() throws IOException { int size = atLeast(100); int dim = atLeast(10); - KnnVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectorValues = vectorValues(size, dim); int topK = 5; - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); Bits acceptOrds = random().nextBoolean() ? null : createRandomAcceptOrds(0, size); int totalMatches = 0; @@ -930,17 +934,20 @@ public void testRandom() throws IOException { T query = randomVector(dim); actual = HnswGraphSearcher.search( - buildScorer(vectors, query), 100, hnsw, acceptOrds, Integer.MAX_VALUE); + buildScorer(vectorValues, query), 100, hnsw, acceptOrds, Integer.MAX_VALUE); TopDocs topDocs = actual.topDocs(); NeighborQueue expected = new NeighborQueue(topK, false); for (int j = 0; j < size; j++) { - if (vectorValue(vectors, j) != null && (acceptOrds == null || acceptOrds.get(j))) { + if (vectorValue(vectorValues, j) != null && (acceptOrds == null || acceptOrds.get(j))) { if (getVectorEncoding() == VectorEncoding.BYTE) { expected.add( - j, similarityFunction.compare((byte[]) query, (byte[]) vectorValue(vectors, j))); + j, + similarityFunction.compare((byte[]) query, (byte[]) vectorValue(vectorValues, j))); } else { expected.add( - j, similarityFunction.compare((float[]) query, (float[]) vectorValue(vectors, j))); + j, + similarityFunction.compare( + (float[]) query, (float[]) vectorValue(vectorValues, j))); } if (expected.size() > topK) { expected.pop(); @@ -964,10 +971,10 @@ public void testOnHeapHnswGraphSearch() throws IOException, ExecutionException, InterruptedException, TimeoutException { int size = atLeast(100); int dim = atLeast(10); - KnnVectorValues vectors = 
vectorValues(size, dim); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + KnnVectorValues vectorValues = vectorValues(size, dim); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); Bits acceptOrds = random().nextBoolean() ? null : createRandomAcceptOrds(0, size); System.out.println("acceptOrds=" + acceptOrds); List queries = new ArrayList<>(); @@ -978,7 +985,7 @@ public void testOnHeapHnswGraphSearch() queries.add(query); expect = HnswGraphSearcher.search( - buildScorer(vectors, query), 100, hnsw, acceptOrds, Integer.MAX_VALUE); + buildScorer(vectorValues, query), 100, hnsw, acceptOrds, Integer.MAX_VALUE); expects.add(expect); } @@ -993,7 +1000,11 @@ public void testOnHeapHnswGraphSearch() try { actual = HnswGraphSearcher.search( - buildScorer(vectors, query), 100, hnsw, acceptOrds, Integer.MAX_VALUE); + buildScorer(vectorValues, query), + 100, + hnsw, + acceptOrds, + Integer.MAX_VALUE); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -1028,8 +1039,8 @@ public void testOnHeapHnswGraphSearch() public void testConcurrentMergeBuilder() throws IOException { int size = atLeast(1000); int dim = atLeast(10); - KnnVectorValues vectors = vectorValues(size, dim); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + KnnVectorValues vectorValues = vectorValues(size, dim); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); ExecutorService exec = Executors.newFixedThreadPool(4, new NamedThreadFactory("hnswMerge")); TaskExecutor taskExecutor = new TaskExecutor(exec); HnswGraphBuilder.randSeed = random().nextLong(); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java 
b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java index 519e1b03469..3b357f25a8f 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java @@ -72,12 +72,12 @@ MockVectorValues vectorValues(float[][] values) { MockVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { FloatVectorValues vectorValues = reader.getFloatVectorValues(fieldName); FloatVectorValues.Floats vectors = vectorValues.vectors(); - float[][] vectors = new float[reader.maxDoc()][]; + float[][] vectorsArray = new float[reader.maxDoc()][]; for (int i = 0; i < vectorValues.size(); i++) { - vectors[vectorValues.ordToDoc(i)] = + vectorsArray[vectorValues.ordToDoc(i)] = ArrayUtil.copyOfSubArray(vectors.get(i), 0, vectorValues.dimension()); } - return MockVectorValues.fromValues(vectors); + return MockVectorValues.fromValues(vectorsArray); } @Override @@ -121,10 +121,10 @@ float[] getTargetVector() { public void testSearchWithSkewedAcceptOrds() throws IOException { int nDoc = 1000; similarityFunction = VectorSimilarityFunction.EUCLIDEAN; - FloatVectorValues vectors = circularVectorValues(nDoc); - RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); + FloatVectorValues vectorValues = circularVectorValues(nDoc); + RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectorValues); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); - OnHeapHnswGraph hnsw = builder.build(vectors.size()); + OnHeapHnswGraph hnsw = builder.build(vectorValues.size()); // Skip over half of the documents that are closest to the query vector FixedBitSet acceptOrds = new FixedBitSet(nDoc); @@ -133,7 +133,7 @@ public void testSearchWithSkewedAcceptOrds() throws IOException { } KnnCollector nn = HnswGraphSearcher.search( - buildScorer(vectors, getTargetVector()), 10, hnsw, acceptOrds, 
Integer.MAX_VALUE); + buildScorer(vectorValues, getTargetVector()), 10, hnsw, acceptOrds, Integer.MAX_VALUE); TopDocs nodes = nn.topDocs(); assertEquals("Number of found results is not equal to [10].", 10, nodes.scoreDocs.length); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java index 2ed99906da6..4081fcbb6c9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java @@ -50,7 +50,7 @@ public class KMeans { /** * Cluster vectors into a given number of clusters * - * @param vectors float vectors + * @param vectorValues float vectors * @param similarityFunction vector similarity function. For COSINE similarity, vectors must be * normalized. * @param numClusters number of cluster to cluster vector into @@ -58,10 +58,10 @@ public class KMeans { * @throws IOException when if there is an error accessing vectors */ public static Results cluster( - FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, int numClusters) + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction, int numClusters) throws IOException { return cluster( - vectors, + vectorValues, numClusters, true, 42L, @@ -75,7 +75,7 @@ public static Results cluster( /** * Expert: Cluster vectors into a given number of clusters * - * @param vectors float vectors + * @param vectorValues float vectors * @param numClusters number of cluster to cluster vector into * @param assignCentroidsToVectors if {@code true} assign centroids for all vectors. Centroids are * computed on a sample of vectors. 
If this parameter is {@code true}, in results also return @@ -92,7 +92,7 @@ public static Results cluster( * @throws IOException if there is error accessing vectors */ public static Results cluster( - FloatVectorValues vectors, + FloatVectorValues vectorValues, int numClusters, boolean assignCentroidsToVectors, long seed, @@ -102,7 +102,7 @@ public static Results cluster( int iters, int sampleSize) throws IOException { - if (vectors.size() == 0) { + if (vectorValues.size() == 0) { return null; } if (numClusters < 1 || numClusters > MAX_NUM_CENTROIDS) { @@ -111,8 +111,8 @@ public static Results cluster( } // adjust sampleSize and numClusters sampleSize = Math.max(sampleSize, 100 * numClusters); - if (sampleSize > vectors.size()) { - sampleSize = vectors.size(); + if (sampleSize > vectorValues.size()) { + sampleSize = vectorValues.size(); // Decrease the number of clusters if needed int maxNumClusters = Math.max(1, sampleSize / 100); numClusters = Math.min(numClusters, maxNumClusters); @@ -121,10 +121,12 @@ public static Results cluster( Random random = new Random(seed); float[][] centroids; if (numClusters == 1) { - centroids = new float[1][vectors.dimension()]; + centroids = new float[1][vectorValues.dimension()]; } else { FloatVectorValues sampleVectors = - vectors.size() <= sampleSize ? vectors : createSampleReader(vectors, sampleSize, seed); + vectorValues.size() <= sampleSize + ? 
vectorValues + : createSampleReader(vectorValues, sampleSize, seed); KMeans kmeans = new KMeans(sampleVectors, numClusters, random, initializationMethod, restarts, iters); centroids = kmeans.computeCentroids(normalizeCenters); @@ -133,9 +135,9 @@ public static Results cluster( short[] vectorCentroids = null; // Assign each vector to the nearest centroid and update the centres if (assignCentroidsToVectors) { - vectorCentroids = new short[vectors.size()]; + vectorCentroids = new short[vectorValues.size()]; // Use kahan summation to get more precise results - KMeans.runKMeansStep(vectors, centroids, vectorCentroids, true, normalizeCenters); + KMeans.runKMeansStep(vectorValues, centroids, vectorCentroids, true, normalizeCenters); } return new Results(centroids, vectorCentroids); } @@ -268,7 +270,7 @@ private float[][] initializePlusPlus() throws IOException { /** * Run kmeans step * - * @param vectors float vectors + * @param vectorValues float vectors * @param centroids centroids, new calculated centroids are written here * @param docCentroids for each document which centroid it belongs to, results will be written * here diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java index 3669079b719..9dd457b8e11 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java @@ -32,11 +32,11 @@ public void testKMeansAPI() throws IOException { int dims = random().nextInt(2, 20); int randIdx = random().nextInt(VectorSimilarityFunction.values().length); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.values()[randIdx]; - FloatVectorValues vectors = generateData(nVectors, dims, nClusters); + FloatVectorValues vectorValues = generateData(nVectors, dims, nClusters); // default case { - KMeans.Results 
results = KMeans.cluster(vectors, similarityFunction, nClusters); + KMeans.Results results = KMeans.cluster(vectorValues, similarityFunction, nClusters); assertEquals(nClusters, results.centroids().length); assertEquals(nVectors, results.vectorCentroids().length); } @@ -52,7 +52,7 @@ public void testKMeansAPI() throws IOException { KMeans.Results results = KMeans.cluster( - vectors, + vectorValues, nClusters, assignCentroidsToVectors, random().nextLong(), @@ -75,9 +75,9 @@ public void testKMeansSpecialCases() throws IOException { // nClusters > nVectors int nClusters = 20; int nVectors = 10; - FloatVectorValues vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectorValues = generateData(nVectors, 5, nClusters); KMeans.Results results = - KMeans.cluster(vectors, VectorSimilarityFunction.EUCLIDEAN, nClusters); + KMeans.cluster(vectorValues, VectorSimilarityFunction.EUCLIDEAN, nClusters); // assert that we get 1 centroid, as nClusters will be adjusted assertEquals(1, results.centroids().length); assertEquals(nVectors, results.vectorCentroids().length); @@ -87,12 +87,12 @@ public void testKMeansSpecialCases() throws IOException { int sampleSize = 2; int nClusters = 2; int nVectors = 300; - FloatVectorValues vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectorValues = generateData(nVectors, 5, nClusters); KMeans.KmeansInitializationMethod initializationMethod = KMeans.KmeansInitializationMethod.PLUS_PLUS; KMeans.Results results = KMeans.cluster( - vectors, + vectorValues, nClusters, true, random().nextLong(), @@ -108,12 +108,12 @@ public void testKMeansSpecialCases() throws IOException { // test unassigned centroids int nClusters = 4; int nVectors = 400; - FloatVectorValues vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectorValues = generateData(nVectors, 5, nClusters); KMeans.Results results = - KMeans.cluster(vectors, VectorSimilarityFunction.EUCLIDEAN, nClusters); + KMeans.cluster(vectorValues, 
VectorSimilarityFunction.EUCLIDEAN, nClusters); float[][] centroids = results.centroids(); List unassignedIdxs = List.of(0, 3); - KMeans.assignCentroids(vectors, centroids, unassignedIdxs); + KMeans.assignCentroids(vectorValues, centroids, unassignedIdxs); assertEquals(nClusters, centroids.length); } } From 2e513805b98f13ec19f7bf6a76e61b666d4ef783 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 14:54:41 -0600 Subject: [PATCH 08/25] EMPTY --- .../lucene92/OffHeapFloatVectorValues.java | 7 +------ .../lucene94/OffHeapFloatVectorValues.java | 7 +------ .../lucene/codecs/lucene95/OffHeapFloatVectorValues.java | 7 +------ .../java/org/apache/lucene/index/ByteVectorValues.java | 1 + .../java/org/apache/lucene/index/FloatVectorValues.java | 9 +++++++++ .../util/quantization/QuantizedByteVectorValues.java | 4 ++++ 6 files changed, 17 insertions(+), 18 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 66744d6e234..24f66c2f944 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -228,12 +228,7 @@ public int size() { @Override public Floats vectors() { - return new Floats() { - @Override - public float[] get(int targetOrd) throws IOException { - throw new UnsupportedOperationException(); - } - }; + return Floats.EMPTY; } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index b412ebd8f5b..d83732a1cac 100644 --- 
a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -241,12 +241,7 @@ public int size() { @Override public Floats vectors() { - return new Floats() { - @Override - public float[] get(int targetOrd) { - throw new UnsupportedOperationException(); - } - }; + return Floats.EMPTY; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 919ba1a19b7..736a552ff04 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -277,12 +277,7 @@ public int size() { @Override public Floats vectors() { - return new Floats() { - @Override - public float[] get(int ord) { - throw new UnsupportedOperationException(); - } - }; + return Floats.EMPTY; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java index 8e3f2264d40..ea187cdd84b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java @@ -42,6 +42,7 @@ public abstract static class Bytes { */ public abstract byte[] get(int ord) throws IOException; + /** A Bytes containing no vectors. Throws UnsupportedOperationException if get() is called. 
*/ public static final Bytes EMPTY = new Bytes() { @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index c58e885d265..6a3ebe4a1af 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -41,6 +41,15 @@ public abstract static class Floats { * @return the vector value */ public abstract float[] get(int ord) throws IOException; + + /** A Floats containing no vectors. Throws UnsupportedOperationException if get() is called. */ + public static final Floats EMPTY = + new Floats() { + @Override + public float[] get(int ord) { + throw new UnsupportedOperationException(); + } + }; } /** Returns a random access (lookup by ord) provider of the vector values */ diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java index 7ffc1bef6eb..84f3f9c410c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java @@ -62,6 +62,10 @@ public IndexInput getSlice() { return null; } + /** + * A QuantizedBytes containing no vectors. Throws UnsupportedOperationException if its methods + * are called. 
+ */ public static final QuantizedBytes EMPTY = new QuantizedBytes() { @Override From 3b8d70fc2dfa0b1c1f77ddd93c8e33f009d113ef Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 15:14:13 -0600 Subject: [PATCH 09/25] CHANGES and MIGRATE entries --- lucene/CHANGES.txt | 3 +++ lucene/MIGRATE.md | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2009437a785..9363efbfeb2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -35,6 +35,9 @@ API Changes --------------------- * GITHUB#13845: Add missing with-discountOverlaps Similarity constructor variants. (Pierre Salagnac, Christine Poerschke, Robert Muir) +* GITHUB#13831: Complete refactoring of random-access vector API, eliminating copy() method. Now random-access vectors + are accessed by calling Byte/FloatVectorValues.vectors().get(int). + New Features --------------------- (No changes) diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 0411791165d..046a0075673 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -892,3 +892,7 @@ segments are rewritten either via `IndexWriter.forceMerge` or ### Vector values APIs switched to primarily random-access `{Byte/Float}VectorValues` no longer inherit from `DocIdSetIterator`. Rather they extend a common class, `KnnVectorValues`, that provides a random access API (previously provided by `RandomAccessVectorValues`, now removed), and an `iterator()` method for retrieving `DocIndexIterator`: an iterator which is a DISI that also provides an `index()` method. Therefore, any iteration over vector values must now be performed using the values' `iterator()`. Random access works as before, but does not require casting to `RandomAccessVectorValues`. + +## Migration from Lucene 10.0 to Lucene 10.1 + +The refactoring of random-access vector API begun in 10.0 is completed in 10.1, where `{Byte/Float}VectorValues.copy()` methods have been removed. 
It is no longer necessary to copy instances of `KnnVectorValues` in order to obtain unique vector sources that do not share underlying data structures. Instead, random-access vectors are accessed via `{Byte/Float}VectorValues.vectors().get(int)`. The `Bytes`/`Floats` instances returned from `{Byte/Float}VectorValues.vectors()` now encapsulate non-shareable storage. From f5e026072d93f819d17d7eba0e4c187d813094df Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 15:33:37 -0600 Subject: [PATCH 10/25] a little more renaming --- .../lucene90/Lucene90HnswGraphBuilder.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index 68d08838adf..bfa2d295621 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -104,7 +104,7 @@ public Lucene90HnswGraphBuilder( * returned values. * * @param vectorValues the vectors for which to build a nearest neighbors graph. Must be an - * independet accessor for the vectors + * independent accessor for the vectors */ public Lucene90OnHeapHnswGraph build(FloatVectorValues vectorValues) throws IOException { if (infoStream.isEnabled(HNSW_COMPONENT)) { @@ -113,9 +113,9 @@ public Lucene90OnHeapHnswGraph build(FloatVectorValues vectorValues) throws IOEx } long start = System.nanoTime(), t = start; // start at node 1! 
node 0 is added implicitly, in the constructor - FloatVectorValues.Floats values = vectorValues.vectors(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int node = 1; node < vectorValues.size(); node++) { - addGraphNode(values.get(node)); + addGraphNode(vectors.get(node)); if (node % 10000 == 0) { if (infoStream.isEnabled(HNSW_COMPONENT)) { long now = System.nanoTime(); @@ -230,12 +230,12 @@ private boolean diversityCheck( float[] candidate, float score, Lucene90NeighborArray neighbors, - FloatVectorValues.Floats vectorValues) + FloatVectorValues.Floats vectors) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { float neighborSimilarity = - similarityFunction.compare(candidate, vectorValues.get(neighbors.node()[i])); + similarityFunction.compare(candidate, vectors.get(neighbors.node()[i])); if (bound.check(neighborSimilarity) == false) { return false; } From 9c68a6e3104d3d42bbd5479ebc316bc07ac72a99 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 15:44:47 -0600 Subject: [PATCH 11/25] mopping up some more values->vectors --- .../lucene90/Lucene90HnswVectorsReader.java | 4 ++-- .../backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java | 6 +++--- .../lucene91/Lucene91HnswVectorsReader.java | 4 ++-- .../lucene92/OffHeapFloatVectorValues.java | 8 ++++---- .../lucene94/OffHeapFloatVectorValues.java | 4 ++-- .../lucene90/Lucene90HnswVectorsWriter.java | 4 ++-- .../lucene94/Lucene94HnswVectorsWriter.java | 4 ++-- .../lucene95/Lucene95HnswVectorsWriter.java | 4 ++-- .../apache/lucene/codecs/BufferingKnnVectorsWriter.java | 4 ++-- .../java/org/apache/lucene/codecs/KnnVectorsWriter.java | 8 ++++---- .../lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java | 6 +++--- .../lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java | 4 ++-- .../java/org/apache/lucene/index/SortingCodecReader.java | 4 ++-- .../function/valuesource/FloatKnnVectorFieldSource.java | 4 ++-- 14 files changed, 34 insertions(+), 34 
deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index 859d4148026..8132ea7292d 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -418,12 +418,12 @@ public VectorScorer scorer(float[] target) { if (size() == 0) { return null; } - FloatVectorValues.Floats values = vectors(); + FloatVectorValues.Floats vectors = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return similarityFunction.compare(values.get(iterator.index()), target); + return similarityFunction.compare(vectors.get(iterator.index()), target); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java index 95f2b8742b4..e99939edd4c 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java @@ -83,7 +83,7 @@ public static NeighborQueue search( throws IOException { int size = graphValues.size(); - FloatVectorValues.Floats values = vectorValues.vectors(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); // MIN heap, holding the top results NeighborQueue results = new NeighborQueue(numSeed, false); // MAX heap, from which to pull the candidate nodes @@ -102,7 +102,7 @@ public static NeighborQueue search( break; } // explore the topK starting points of some random numSeed probes - float score = 
similarityFunction.compare(query, values.get(entryPoint)); + float score = similarityFunction.compare(query, vectors.get(entryPoint)); candidates.add(entryPoint, score); if (acceptOrds == null || acceptOrds.get(entryPoint)) { results.add(entryPoint, score); @@ -138,7 +138,7 @@ public static NeighborQueue search( break; } - float friendSimilarity = similarityFunction.compare(query, values.get(friendOrd)); + float friendSimilarity = similarityFunction.compare(query, vectors.get(friendOrd)); if (results.size() < numSeed || bound.check(friendSimilarity) == false) { candidates.add(friendOrd, friendSimilarity); if (acceptOrds == null || acceptOrds.get(friendOrd)) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java index 0e1b349995c..e4283f20ccc 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java @@ -459,12 +459,12 @@ public VectorScorer scorer(float[] target) throws IOException { if (size == 0) { return null; } - Floats values = vectors(); + Floats vectors = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return similarityFunction.compare(values.get(iterator.index()), target); + return similarityFunction.compare(vectors.get(iterator.index()), target); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 24f66c2f944..d8b6a66bc32 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -118,12 +118,12 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { - FloatVectorValues.Floats values = vectors(); + FloatVectorValues.Floats vectors = vectors(); DocIndexIterator iterator = iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(values.get(iterator.index()), query); + return vectorSimilarityFunction.compare(vectors.get(iterator.index()), query); } @Override @@ -194,12 +194,12 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { - FloatVectorValues.Floats values = vectors(); + FloatVectorValues.Floats vectors = vectors(); IndexedDISI disi = createDISI(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(values.get(disi.index()), query); + return vectorSimilarityFunction.compare(vectors.get(disi.index()), query); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index d83732a1cac..6d7c1155c77 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -208,11 +208,11 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { IndexedDISI disi = createDISI(); - Floats values = vectors(); + Floats vectors = vectors(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(values.get(disi.index()), query); + return 
vectorSimilarityFunction.compare(vectors.get(disi.index()), query); } @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java index 22eb3bf7916..fef8656d892 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java @@ -189,10 +189,10 @@ private static int[] writeVectorData(IndexOutput output, FloatVectorValues vecto ByteBuffer binaryVector = ByteBuffer.allocate(vectorValues.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); - FloatVectorValues.Floats values = vectorValues.vectors(); + FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - binaryVector.asFloatBuffer().put(values.get(iter.index())); + binaryVector.asFloatBuffer().put(vectors.get(iter.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docIds[count++] = docV; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java index 38b0779ab5f..cc642d0422e 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -605,13 +605,13 @@ private static DocsWithFieldSet writeVectorData( IndexOutput output, FloatVectorValues floatVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); 
KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); - FloatVectorValues.Floats values = floatVectorValues.vectors(); + FloatVectorValues.Floats vectors = floatVectorValues.vectors(); ByteBuffer binaryVector = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - binaryVector.asFloatBuffer().put(values.get(iter.index())); + binaryVector.asFloatBuffer().put(vectors.get(iter.index())); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index 471ba54cca6..dc255b7bf93 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -656,10 +656,10 @@ private static DocsWithFieldSet writeVectorData( ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); - FloatVectorValues.Floats values = floatVectorValues.vectors(); + FloatVectorValues.Floats vectors = floatVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - buffer.asFloatBuffer().put(values.get(iter.index())); + buffer.asFloatBuffer().put(vectors.get(iter.index())); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 
c73f5454136..e1ec70cc6f0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -163,11 +163,11 @@ private static class SortingByteVectorValues extends ByteVectorValues { @Override public Bytes vectors() throws IOException { return new Bytes() { - Bytes values = delegate.vectors(); + Bytes vectors = delegate.vectors(); @Override public byte[] get(int ord) throws IOException { - return values.get(ord); + return vectors.get(ord); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index f96407257c4..ef68b6c5001 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -61,10 +61,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); ByteVectorValues mergedBytes = MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); - ByteVectorValues.Bytes values = mergedBytes.vectors(); + ByteVectorValues.Bytes vectors = mergedBytes.vectors(); KnnVectorValues.DocIndexIterator iter = mergedBytes.iterator(); for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { - byteWriter.addValue(doc, values.get(iter.index())); + byteWriter.addValue(doc, vectors.get(iter.index())); } } case FLOAT32 -> { @@ -72,10 +72,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE (KnnFieldVectorsWriter) addField(fieldInfo); FloatVectorValues mergedFloats = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); - FloatVectorValues.Floats values = mergedFloats.vectors(); + FloatVectorValues.Floats vectors = mergedFloats.vectors(); KnnVectorValues.DocIndexIterator iter = mergedFloats.iterator(); for (int doc = 
iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { - floatWriter.addValue(doc, values.get(iter.index())); + floatWriter.addValue(doc, vectors.get(iter.index())); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java index da670aa673b..0753e689175 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java @@ -87,12 +87,12 @@ public RandomVectorScorer getRandomVectorScorer( scalarQuantizer.getConstantMultiplier(), scalarQuantizer.getBits()); return new RandomVectorScorer.AbstractRandomVectorScorer(quantizedByteVectorValues) { - QuantizedByteVectorValues.QuantizedBytes values = quantizedByteVectorValues.vectors(); + QuantizedByteVectorValues.QuantizedBytes vectors = quantizedByteVectorValues.vectors(); @Override public float score(int node) throws IOException { - byte[] nodeVector = values.get(node); - float nodeOffset = values.getScoreCorrectionConstant(node); + byte[] nodeVector = vectors.get(node); + float nodeOffset = vectors.getScoreCorrectionConstant(node); return scalarQuantizedVectorSimilarity.score( targetBytes, offsetCorrection, nodeVector, nodeOffset); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 5697abcde10..22886c2d9fb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -363,10 +363,10 @@ private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); 
KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); - ByteVectorValues.Bytes values = byteVectorValues.vectors(); + ByteVectorValues.Bytes vectors = byteVectorValues.vectors(); for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = values.get(iter.index()); + byte[] binaryValue = vectors.get(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 56e8c9eb6d3..6e2005aebda 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -359,11 +359,11 @@ private static class SortingByteVectorValues extends ByteVectorValues { @Override public Bytes vectors() throws IOException { return new Bytes() { - Bytes values = delegate.vectors(); + Bytes vectors = delegate.vectors(); @Override public byte[] get(int ord) throws IOException { - return values.get(ord); + return vectors.get(ord); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java index c05ef6a48c3..b192175eaf8 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java @@ -64,12 +64,12 @@ protected DocIdSetIterator getVectorIterator() { return new VectorFieldFunction(this) { KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - FloatVectorValues.Floats values = vectorValues.vectors(); + FloatVectorValues.Floats 
vectors = vectorValues.vectors(); @Override public float[] floatVectorVal(int doc) throws IOException { if (exists(doc)) { - return values.get(iterator.index()); + return vectors.get(iterator.index()); } else { return null; } From f035183bd27b05cb4724ca47b1eebf4e51a4e0be Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 17:33:54 -0600 Subject: [PATCH 12/25] fix javadoc --- .../backward_codecs/lucene90/Lucene90HnswGraphBuilder.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index bfa2d295621..e393342535b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -222,8 +222,7 @@ private void popToScratch(NeighborQueue candidates) { * @param score the score of the new candidate and node n, to be compared with scores of the * candidate and n's neighbors * @param neighbors the neighbors selected so far - * @param vectorValues source of values used for making comparisons between candidate and existing - * neighbors + * @param vectors used for making comparisons between candidate and existing neighbors * @return whether the candidate is diverse given the existing neighbors */ private boolean diversityCheck( From 23c7497581688ed391810e4fd9ff5b74238725da Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 7 Oct 2024 17:45:42 -0600 Subject: [PATCH 13/25] fix error introduced in refactoring (init lastSubIndex to -1 instead of 0) --- .../apache/lucene/index/SlowCompositeCodecReaderWrapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java 
b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index 99802a163ae..920da7dbb6e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -885,7 +885,7 @@ public int size() { @Override public Floats vectors() { return new Floats() { - int lastSubIndex = -1; + int lastSubIndex = 0; Floats subValues; @Override From 63a4d8314fb63fdd9962b205c351a9074c19443f Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 14 Oct 2024 11:17:50 -0400 Subject: [PATCH 14/25] Add BaseKnnVectorsFormatTestCase.testRecall() and fix map ord to doc in Lucene90HnswVectorsReader --- .../lucene90/Lucene90HnswVectorsReader.java | 2 +- ...tLucene99ScalarQuantizedVectorsFormat.java | 5 + .../org/apache/lucene/index/TestKnnGraph.java | 2 +- .../index/BaseKnnVectorsFormatTestCase.java | 116 ++++ .../org/apache/lucene/tests/index/LICENSE.txt | 507 ++++++++++++++++++ .../org/apache/lucene/tests/index/NOTICE.txt | 197 +++++++ 6 files changed, 827 insertions(+), 2 deletions(-) create mode 100644 lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt create mode 100644 lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index 8132ea7292d..ccb8770f838 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -260,7 +260,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits int node = results.topNode(); float minSimilarity = results.topScore(); results.pop(); - 
knnCollector.collect(node, minSimilarity); + knnCollector.collect(vectorValues.ordToDoc(node), minSimilarity); } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java index 5315ab74785..3ffe62c6461 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -102,6 +102,11 @@ public void testSearch() throws Exception { } } + @Override + public void testRecall() { + // ignore this test since this class always returns no results from search + } + public void testQuantizedVectorsWriteAndRead() throws Exception { // create lucene directory with codec int numVectors = 1 + random().nextInt(50); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index 96332b84eed..97bd624e97f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -563,7 +563,7 @@ private void add( String idString = Integer.toString(id); doc.add(new StringField("id", idString, Field.Store.YES)); doc.add(new SortedDocValuesField("id", new BytesRef(idString))); - // XSSystem.out.println("add " + idString + " " + Arrays.toString(vector)); + // System.out.println("add " + idString + " " + Arrays.toString(vector)); iw.updateDocument(new Term("id", idString), doc); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index c0171ac2bf3..82dfc0c252d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -16,15 +16,22 @@ */ package org.apache.lucene.tests.index; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.ByteArrayOutputStream; +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; @@ -70,6 +77,10 @@ import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; @@ -1931,4 +1942,109 @@ public void testMismatchedFields() throws Exception { IOUtils.close(reader, w2, dir1, dir2); } + + /** Test that the query is a viable approximation to exact search. This test is designed to + uncover gross failures only, not to represent the true expected recall. 
+ */ + public void testRecall() throws IOException { + // TODO: vary the function randomly + VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; + int dim = 16; + try (Directory indexStore = getKnownIndexStore("field", dim, vectorSimilarityFunction); + IndexReader reader = DirectoryReader.open(indexStore)) { + IndexSearcher searcher = newSearcher(reader); + float[] queryEmbedding = new float[dim]; + String queryString = "Apache License"; + computeLineEmbedding(queryString, queryEmbedding); + //computeLineEmbedding(" END OF TERMS AND CONDITIONS", queryEmbedding); + // pass match-all "filter" to force full traversal, bypassing graph + KnnFloatVectorQuery exactQuery = new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); + // indexed 421 lines from LICENSE.txt + // indexed 157 lines from NOTICE.txt + assertEquals(578, searcher.count(exactQuery)); // Same for exact search + KnnFloatVectorQuery query = new KnnFloatVectorQuery("field", queryEmbedding, 10); + assertEquals(10, searcher.count(query)); // Expect some results without timeout + TopDocs results = searcher.search(query, 10); + Set resultDocs = new HashSet<>(); + int i = 0; + for (ScoreDoc scoreDoc : results.scoreDocs) { + System.out.println("result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + resultDocs.add(scoreDoc.doc); + } + TopDocs expected = searcher.search(exactQuery, 10); + i = 0; + int recalled = 0; + for (ScoreDoc scoreDoc : expected.scoreDocs) { + System.out.println("expected " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + if (resultDocs.contains(scoreDoc.doc)) { + ++recalled; + } + } + assertTrue("recall should be at least 5/10, got " + recalled, recalled >= 5); + /* + assertEquals(queryString, reader.storedFields().document(results.scoreDocs[0].doc).get("text")); + assertEquals("Copyright (c) 2006 Dawid Weiss", 
reader.storedFields().document(results.scoreDocs[1].doc).get("text")); + assertEquals(queryString, reader.storedFields().document(expected.scoreDocs[0].doc).get("text")); + assertEquals("Copyright (c) 2006 Dawid Weiss", reader.storedFields().document(expected.scoreDocs[1].doc).get("text")); + */ + } + } + + /** Creates a new directory and adds documents with the given vectors as kNN vector fields */ + Directory getKnownIndexStore(String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { + Directory indexStore = newDirectory(random()); + IndexWriter writer = new IndexWriter(indexStore, newIndexWriterConfig()); + float[] scratch = new float[dimension]; + for (String file : List.of("LICENSE.txt", "NOTICE.txt")) { + try (InputStream in = BaseKnnVectorsFormatTestCase.class.getResourceAsStream(file); + BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { + List lines = new ArrayList<>(); + String line; + int lineNo = -1; + while ((line = reader.readLine()) != null) { + line = line.strip(); + if (line.isEmpty()) { + continue; + } + ++lineNo; + Document doc = new Document(); + doc.add(new KnnFloatVectorField(field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); + doc.add(new StoredField("text", line)); + doc.add(new StringField("id", file + "." 
+ lineNo, Field.Store.YES)); + writer.addDocument(doc); + if (random().nextBoolean()) { + // Add some documents without a vector + addDocuments(writer, "id" + lineNo + ".", randomIntBetween(1, 5)); + } + } + //System.out.println("indexed " + (lineNo + 1) + " lines from " + file); + } + } + // Add some documents without a vector nor an id + addDocuments(writer, null, 5); + writer.close(); + return indexStore; + } + + private float[] computeLineEmbedding(String line, float[] vector) { + Arrays.fill(vector, 0); + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + vector[i % vector.length] += (c << (i * 8 / vector.length)); + } + VectorUtil.l2normalize(vector, false); + return vector; + } + + private void addDocuments(IndexWriter writer, String idBase, int count) throws IOException { + for (int i = 0; i < count; i++) { + Document doc = new Document(); + doc.add(new StringField("other", "value", Field.Store.NO)); + if (idBase != null) { + doc.add(new StringField("id", idBase + i, Field.Store.YES)); + } + writer.addDocument(doc); + } + } + } diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt new file mode 100644 index 00000000000..fc1b33ae9b3 --- /dev/null +++ b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt @@ -0,0 +1,507 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. 
hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + +Some code in core/src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: + +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from ICU (http://www.icu-project.org) +The full license is available here: + https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE + +/* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. 
+ */ + +The following license applies to the Snowball stemmers: + +Copyright (c) 2001, Dr Martin Porter +Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the KStemmer: + +Copyright © 2003, +Center for Intelligent Information Retrieval, +University of Massachusetts, Amherst. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. The names "Center for Intelligent Information Retrieval" and +"University of Massachusetts" must not be used to endorse or promote products +derived from this software without prior written permission. To obtain +permission, contact info@ciir.cs.umass.edu. + +THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, +Marcin Woliński, Robert Wołosz + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +core/src/java/org/apache/lucene/util/compress/LZ4.java is a Java +implementation of the LZ4 (https://github.com/lz4/lz4/tree/dev/lib) +compression format for Lucene's DataInput/DataOutput abstractions. + +LZ4 Library +Copyright (c) 2011-2016, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt new file mode 100644 index 00000000000..ea6903484c0 --- /dev/null +++ b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2022 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. 
+ +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. 
+ +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz From 2099589e746771ed0d076b463b4640bfe8dfc454 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Mon, 14 Oct 2024 11:17:50 -0400 Subject: [PATCH 15/25] Add BaseKnnVectorsFormatTestCase.testRecall() and fix map ord to doc in Lucene90HnswVectorsReader --- .../lucene91/Lucene91HnswGraphBuilder.java | 2 +- .../lucene/util/hnsw/HnswGraphBuilder.java | 2 +- .../index/BaseKnnVectorsFormatTestCase.java | 51 ++++++++++++------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java index 2d5aa8b7b9f..2fd384e2739 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java @@ -234,7 +234,7 @@ private void popToScratch(HnswGraphBuilder.GraphBuilderKnnCollector candidates) // extract all the Neighbors from the queue into an array; these will now be // sorted from worst to best for (int i = 0; i < candidateCount; i++) { - float similarity = candidates.minCompetitiveSimilarity(); + float similarity = candidates.minimumScore(); scratch.add(candidates.popNode(), similarity); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java index 
75b579f0bd9..bed1480e926 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java @@ -555,7 +555,7 @@ public int[] popUntilNearestKNodes() { return queue.nodes(); } - float minimumScore() { + public float minimumScore() { return queue.topScore(); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 82dfc0c252d..3201335eb74 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -21,12 +21,14 @@ import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.io.IOException; -import java.util.ArrayList; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -1943,22 +1945,23 @@ public void testMismatchedFields() throws Exception { IOUtils.close(reader, w2, dir1, dir2); } - /** Test that the query is a viable approximation to exact search. This test is designed to - uncover gross failures only, not to represent the true expected recall. + /** + * Test that the query is a viable approximation to exact search. This test is designed to uncover + * gross failures only, not to represent the true expected recall. 
*/ public void testRecall() throws IOException { - // TODO: vary the function randomly VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; int dim = 16; try (Directory indexStore = getKnownIndexStore("field", dim, vectorSimilarityFunction); - IndexReader reader = DirectoryReader.open(indexStore)) { + IndexReader reader = DirectoryReader.open(indexStore)) { IndexSearcher searcher = newSearcher(reader); float[] queryEmbedding = new float[dim]; String queryString = "Apache License"; computeLineEmbedding(queryString, queryEmbedding); - //computeLineEmbedding(" END OF TERMS AND CONDITIONS", queryEmbedding); + // computeLineEmbedding(" END OF TERMS AND CONDITIONS", queryEmbedding); // pass match-all "filter" to force full traversal, bypassing graph - KnnFloatVectorQuery exactQuery = new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); + KnnFloatVectorQuery exactQuery = + new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); // indexed 421 lines from LICENSE.txt // indexed 157 lines from NOTICE.txt assertEquals(578, searcher.count(exactQuery)); // Same for exact search @@ -1966,16 +1969,26 @@ public void testRecall() throws IOException { assertEquals(10, searcher.count(query)); // Expect some results without timeout TopDocs results = searcher.search(query, 10); Set resultDocs = new HashSet<>(); - int i = 0; for (ScoreDoc scoreDoc : results.scoreDocs) { - System.out.println("result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + /* + System.out.println( + "result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + */ resultDocs.add(scoreDoc.doc); } TopDocs expected = searcher.search(exactQuery, 10); - i = 0; + // int i = 0; int recalled = 0; for (ScoreDoc scoreDoc : expected.scoreDocs) { - System.out.println("expected " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + /* + 
System.out.println( + "expected " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + */ if (resultDocs.contains(scoreDoc.doc)) { ++recalled; } @@ -1991,14 +2004,15 @@ public void testRecall() throws IOException { } /** Creates a new directory and adds documents with the given vectors as kNN vector fields */ - Directory getKnownIndexStore(String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { + Directory getKnownIndexStore( + String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) + throws IOException { Directory indexStore = newDirectory(random()); IndexWriter writer = new IndexWriter(indexStore, newIndexWriterConfig()); float[] scratch = new float[dimension]; for (String file : List.of("LICENSE.txt", "NOTICE.txt")) { try (InputStream in = BaseKnnVectorsFormatTestCase.class.getResourceAsStream(file); - BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { - List lines = new ArrayList<>(); + BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { String line; int lineNo = -1; while ((line = reader.readLine()) != null) { @@ -2008,7 +2022,9 @@ Directory getKnownIndexStore(String field, int dimension, VectorSimilarityFuncti } ++lineNo; Document doc = new Document(); - doc.add(new KnnFloatVectorField(field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); + doc.add( + new KnnFloatVectorField( + field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); doc.add(new StoredField("text", line)); doc.add(new StringField("id", file + "." 
+ lineNo, Field.Store.YES)); writer.addDocument(doc); @@ -2017,7 +2033,7 @@ Directory getKnownIndexStore(String field, int dimension, VectorSimilarityFuncti addDocuments(writer, "id" + lineNo + ".", randomIntBetween(1, 5)); } } - //System.out.println("indexed " + (lineNo + 1) + " lines from " + file); + // System.out.println("indexed " + (lineNo + 1) + " lines from " + file); } } // Add some documents without a vector nor an id @@ -2030,7 +2046,7 @@ private float[] computeLineEmbedding(String line, float[] vector) { Arrays.fill(vector, 0); for (int i = 0; i < line.length(); i++) { char c = line.charAt(i); - vector[i % vector.length] += (c << (i * 8 / vector.length)); + vector[i % vector.length] += c / ((float) (i + 1) / vector.length); } VectorUtil.l2normalize(vector, false); return vector; @@ -2046,5 +2062,4 @@ private void addDocuments(IndexWriter writer, String idBase, int count) throws I writer.addDocument(doc); } } - } From 61419004b9a21e8573776d75470820e3454bb491 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 17 Oct 2024 05:12:01 -0700 Subject: [PATCH 16/25] handle stray prints --- .../org/apache/lucene/index/TestKnnGraph.java | 1 - .../index/BaseKnnVectorsFormatTestCase.java | 35 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index 97bd624e97f..f9564b11d08 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -563,7 +563,6 @@ private void add( String idString = Integer.toString(id); doc.add(new StringField("id", idString, Field.Store.YES)); doc.add(new SortedDocValuesField("id", new BytesRef(idString))); - // System.out.println("add " + idString + " " + Arrays.toString(vector)); iw.updateDocument(new Term("id", idString), doc); } } diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 3201335eb74..b841737249b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -1969,37 +1969,32 @@ public void testRecall() throws IOException { assertEquals(10, searcher.count(query)); // Expect some results without timeout TopDocs results = searcher.search(query, 10); Set resultDocs = new HashSet<>(); + int i = 0; for (ScoreDoc scoreDoc : results.scoreDocs) { - /* - System.out.println( - "result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); - */ + if (VERBOSE) { + System.out.println( + "result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + } resultDocs.add(scoreDoc.doc); } TopDocs expected = searcher.search(exactQuery, 10); - // int i = 0; int recalled = 0; + i = 0; for (ScoreDoc scoreDoc : expected.scoreDocs) { - /* - System.out.println( - "expected " - + i++ - + ": " - + reader.storedFields().document(scoreDoc.doc) - + " " - + scoreDoc); - */ + if (VERBOSE) { + System.out.println( + "expected " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + } if (resultDocs.contains(scoreDoc.doc)) { ++recalled; } } assertTrue("recall should be at least 5/10, got " + recalled, recalled >= 5); - /* - assertEquals(queryString, reader.storedFields().document(results.scoreDocs[0].doc).get("text")); - assertEquals("Copyright (c) 2006 Dawid Weiss", reader.storedFields().document(results.scoreDocs[1].doc).get("text")); - assertEquals(queryString, reader.storedFields().document(expected.scoreDocs[0].doc).get("text")); - assertEquals("Copyright (c) 2006 Dawid Weiss", 
reader.storedFields().document(expected.scoreDocs[1].doc).get("text")); - */ } } From 61a0d790d251ebe10025b8bc780e2fcfa97a7a75 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 17 Oct 2024 05:33:02 -0700 Subject: [PATCH 17/25] test all similarities and more queries --- .../index/BaseKnnVectorsFormatTestCase.java | 97 +++++++++++-------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index b841737249b..9ab606db899 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -1950,51 +1950,64 @@ public void testMismatchedFields() throws Exception { * gross failures only, not to represent the true expected recall. */ public void testRecall() throws IOException { - VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; - int dim = 16; - try (Directory indexStore = getKnownIndexStore("field", dim, vectorSimilarityFunction); - IndexReader reader = DirectoryReader.open(indexStore)) { - IndexSearcher searcher = newSearcher(reader); - float[] queryEmbedding = new float[dim]; - String queryString = "Apache License"; - computeLineEmbedding(queryString, queryEmbedding); - // computeLineEmbedding(" END OF TERMS AND CONDITIONS", queryEmbedding); - // pass match-all "filter" to force full traversal, bypassing graph - KnnFloatVectorQuery exactQuery = - new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); - // indexed 421 lines from LICENSE.txt - // indexed 157 lines from NOTICE.txt - assertEquals(578, searcher.count(exactQuery)); // Same for exact search - KnnFloatVectorQuery query = new KnnFloatVectorQuery("field", queryEmbedding, 10); - assertEquals(10, 
searcher.count(query)); // Expect some results without timeout - TopDocs results = searcher.search(query, 10); - Set resultDocs = new HashSet<>(); - int i = 0; - for (ScoreDoc scoreDoc : results.scoreDocs) { - if (VERBOSE) { - System.out.println( - "result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); - } - resultDocs.add(scoreDoc.doc); - } - TopDocs expected = searcher.search(exactQuery, 10); + VectorSimilarityFunction[] functions = { VectorSimilarityFunction.EUCLIDEAN, VectorSimilarityFunction.COSINE, VectorSimilarityFunction.DOT_PRODUCT, VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT }; + for (VectorSimilarityFunction vectorSimilarityFunction : functions) { + int dim = 16; int recalled = 0; - i = 0; - for (ScoreDoc scoreDoc : expected.scoreDocs) { - if (VERBOSE) { - System.out.println( - "expected " - + i++ - + ": " - + reader.storedFields().document(scoreDoc.doc) - + " " - + scoreDoc); - } - if (resultDocs.contains(scoreDoc.doc)) { - ++recalled; + try (Directory indexStore = getKnownIndexStore("field", dim, vectorSimilarityFunction); + IndexReader reader = DirectoryReader.open(indexStore)) { + IndexSearcher searcher = newSearcher(reader); + float[] queryEmbedding = new float[dim]; + // indexed 421 lines from LICENSE.txt + // indexed 157 lines from NOTICE.txt + int numQueries = 578; + String[] testQueries = { "Apache Lucene", "Apache License", "TERMS AND CONDITIONS", + "Copyright 2001", "Permission is hereby", "Copyright © 2003", + "The dictionary comes from Morfologik project", + "The levenshtein automata tables" + }; + for (String queryString : testQueries) { + computeLineEmbedding(queryString, queryEmbedding); + + // pass match-all "filter" to force full traversal, bypassing graph + KnnFloatVectorQuery exactQuery = + new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); + assertEquals(numQueries, searcher.count(exactQuery)); // Same for exact search + + KnnFloatVectorQuery query = new
KnnFloatVectorQuery("field", queryEmbedding, 10); + assertEquals(10, searcher.count(query)); // Expect some results without timeout + TopDocs results = searcher.search(query, 10); + Set resultDocs = new HashSet<>(); + int i = 0; + for (ScoreDoc scoreDoc : results.scoreDocs) { + if (VERBOSE) { + System.out.println( + "result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); + } + resultDocs.add(scoreDoc.doc); + } + TopDocs expected = searcher.search(exactQuery, 10); + i = 0; + for (ScoreDoc scoreDoc : expected.scoreDocs) { + if (VERBOSE) { + System.out.println( + "expected " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + } + if (resultDocs.contains(scoreDoc.doc)) { + ++recalled; + } + } } + assertTrue("Average recall for " + vectorSimilarityFunction + + " should be at least " + (testQueries.length * 5) + " / " + + (testQueries.length * 10) + ", got " + recalled, + recalled >= testQueries.length * 5); } - assertTrue("recall should be at least 5/10, got " + recalled, recalled >= 5); } } From 5a6d7093e2fbafd393a8b06b85b42f56e6de10d0 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 17 Oct 2024 12:42:26 -0700 Subject: [PATCH 18/25] fix Lucene90Hnsw that was aliasing vector values --- .../lucene90/Lucene90HnswGraphBuilder.java | 7 ++----- .../lucene90/Lucene90HnswVectorsReader.java | 10 +++++----- .../lucene90/Lucene90OnHeapHnswGraph.java | 5 ++--- .../tests/index/BaseKnnVectorsFormatTestCase.java | 2 -- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index e393342535b..44535845a2a 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -49,7 +49,6 @@ public final class Lucene90HnswGraphBuilder { private final Lucene90NeighborArray scratch; private final VectorSimilarityFunction similarityFunction; - private final FloatVectorValues vectorValues; private final FloatVectorValues.Floats vectors; private final SplittableRandom random; private final Lucene90BoundsChecker bound; @@ -80,8 +79,7 @@ public Lucene90HnswGraphBuilder( int beamWidth, long seed) throws IOException { - this.vectorValues = vectorValues; - this.vectors = vectorValues.vectors(); + vectors = vectorValues.vectors(); buildVectors = vectorValues.vectors(); this.similarityFunction = Objects.requireNonNull(similarityFunction); if (maxConn <= 0) { @@ -113,7 +111,6 @@ public Lucene90OnHeapHnswGraph build(FloatVectorValues vectorValues) throws IOEx } long start = System.nanoTime(), t = start; // start at node 1! node 0 is added implicitly, in the constructor - FloatVectorValues.Floats vectors = vectorValues.vectors(); for (int node = 1; node < vectorValues.size(); node++) { addGraphNode(vectors.get(node)); if (node % 10000 == 0) { @@ -147,7 +144,7 @@ void addGraphNode(float[] value) throws IOException { value, beamWidth, beamWidth, - vectorValues, + buildVectors, similarityFunction, hnsw, null, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index ccb8770f838..0aa752894cf 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -249,7 +249,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits target, knnCollector.k(), knnCollector.k(), - vectorValues, 
+ vectorValues.vectors(), fieldEntry.similarityFunction, getGraphValues(fieldEntry), getAcceptOrds(acceptDocs, fieldEntry), @@ -360,7 +360,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues { final int byteSize; int lastOrd = -1; - final float[] value; final VectorSimilarityFunction similarityFunction; OffHeapFloatVectorValues( @@ -374,7 +373,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues { this.similarityFunction = similarityFunction; byteSize = Float.BYTES * dimension; - value = new float[dimension]; } @Override @@ -389,14 +387,16 @@ public int size() { @Override public Floats vectors() { + IndexInput input = dataIn.clone(); + float[] value = new float[dimension]; return new Floats() { @Override public float[] get(int targetOrd) throws IOException { if (lastOrd == targetOrd) { return value; } - dataIn.seek((long) targetOrd * byteSize); - dataIn.readFloats(value, 0, value.length); + input.seek((long) targetOrd * byteSize); + input.readFloats(value, 0, value.length); lastOrd = targetOrd; return value; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java index e99939edd4c..63c5ffe42bf 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java @@ -62,7 +62,7 @@ public final class Lucene90OnHeapHnswGraph extends HnswGraph { * @param topK the number of nodes to be returned * @param numSeed the size of the queue maintained while searching, and controls the number of * random entry points to sample - * @param vectorValues vector values + * @param vectors vectors to search whose ordinals are in the graph * @param graphValues the graph values. 
May represent the entire graph, or a level in a * hierarchical graph. * @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or @@ -74,7 +74,7 @@ public static NeighborQueue search( float[] query, int topK, int numSeed, - FloatVectorValues vectorValues, + FloatVectorValues.Floats vectors, VectorSimilarityFunction similarityFunction, HnswGraph graphValues, Bits acceptOrds, @@ -83,7 +83,6 @@ public static NeighborQueue search( throws IOException { int size = graphValues.size(); - FloatVectorValues.Floats vectors = vectorValues.vectors(); // MIN heap, holding the top results NeighborQueue results = new NeighborQueue(numSeed, false); // MAX heap, from which to pull the candidate nodes diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 9ab606db899..3c5bd034c48 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -1047,8 +1047,6 @@ public void testByteVectorScorerIteration() throws Exception { VectorScorer newScorer = vectorValues.scorer(vectorToScore); assertNotNull(newScorer); assertNotSame(scorer, newScorer); - System.out.println("first iterator=" + iterator); - System.out.println("new iterator=" + newScorer.iterator()); assertNotSame(iterator, newScorer.iterator()); } } From bbe4d28e05d3059a18aec29ea291f56c5289619d Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Fri, 18 Oct 2024 09:18:50 -0400 Subject: [PATCH 19/25] remove stray print --- .../apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java 
b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 3c5bd034c48..df8de2810a5 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -1041,8 +1041,6 @@ public void testByteVectorScorerIteration() throws Exception { assertTrue(score >= 0f); assertEquals(iterator.docID(), valuesIterator.docID()); } - System.out.println("values=" + vectorValues); - System.out.println("scorer=" + scorer); // verify that a new scorer can be obtained after iteration VectorScorer newScorer = vectorValues.scorer(vectorToScore); assertNotNull(newScorer); From 568372fe45227bb173f47e4a00889425a38243c0 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Fri, 18 Oct 2024 15:40:09 -0400 Subject: [PATCH 20/25] fix initialization bug in SlowCompositeCodecReaderWrapper --- .../lucene/index/SlowCompositeCodecReaderWrapper.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index 920da7dbb6e..fa8dea05096 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -887,6 +887,7 @@ public Floats vectors() { return new Floats() { int lastSubIndex = 0; Floats subValues; + // TODO: cache the subValues so we only need to create each one once (same in Bytes below) @Override public float[] get(int ord) throws IOException { @@ -894,7 +895,7 @@ public float[] get(int ord) throws IOException { // We need to implement fully random-access API here in order to support callers like // SortingCodecReader that rely on it. 
int newSubIndex = findSub(ord, lastSubIndex, starts); - if (newSubIndex != lastSubIndex) { + if (newSubIndex != lastSubIndex || subValues == null) { lastSubIndex = newSubIndex; assert subs.get(lastSubIndex).sub != null; subValues = subs.get(lastSubIndex).sub.vectors(); @@ -963,7 +964,7 @@ public int size() { @Override public Bytes vectors() { return new Bytes() { - int lastSubIndex = -1; + int lastSubIndex = 0; Bytes subValues; @Override @@ -973,7 +974,7 @@ public byte[] get(int ord) throws IOException { // SortingCodecReader that rely on it. We maintain lastSubIndex since we expect some // repetition. int newSubIndex = findSub(ord, lastSubIndex, starts); - if (newSubIndex != lastSubIndex) { + if (newSubIndex != lastSubIndex || subValues == null) { lastSubIndex = newSubIndex; assert subs.get(lastSubIndex).sub != null; subValues = subs.get(lastSubIndex).sub.vectors(); From da062889de022dd9c2756074a8c4acf0e62ee1d7 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Tue, 22 Oct 2024 15:00:41 -0400 Subject: [PATCH 21/25] simplifications from PR feedback --- .../lucene/index/ExitableDirectoryReader.java | 17 ++--------------- .../index/SlowCompositeCodecReaderWrapper.java | 1 + .../lucene/index/SortingCodecReader.java | 18 ++---------------- .../index/BaseKnnVectorsFormatTestCase.java | 3 --- 4 files changed, 5 insertions(+), 34 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index 3d4bccf6cc2..8a9477e69c0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -442,13 +442,7 @@ public int dimension() { @Override public Floats vectors() throws IOException { - Floats vectors = vectorValues.vectors(); - return new Floats() { - @Override - public float[] get(int ord) throws IOException { - return vectors.get(ord); - } - }; + return 
vectorValues.vectors(); } @Override @@ -491,14 +485,7 @@ public int size() { @Override public Bytes vectors() throws IOException { - return new Bytes() { - Bytes vectors = vectorValues.vectors(); - - @Override - public byte[] get(int ord) throws IOException { - return vectors.get(ord); - } - }; + return vectorValues.vectors(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index fa8dea05096..f2d350d51e9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -887,6 +887,7 @@ public Floats vectors() { return new Floats() { int lastSubIndex = 0; Floats subValues; + // TODO: cache the subValues so we only need to create each one once (same in Bytes below) @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 6e2005aebda..7a096c0186a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -320,14 +320,7 @@ private static class SortingFloatVectorValues extends FloatVectorValues { @Override public Floats vectors() throws IOException { - Floats delegateVectors = delegate.vectors(); - return new Floats() { - @Override - public float[] get(int ord) throws IOException { - // ords are interpreted in the delegate's ord-space. 
- return delegateVectors.get(ord); - } - }; + return delegate.vectors(); } @Override @@ -358,14 +351,7 @@ private static class SortingByteVectorValues extends ByteVectorValues { @Override public Bytes vectors() throws IOException { - return new Bytes() { - Bytes vectors = delegate.vectors(); - - @Override - public byte[] get(int ord) throws IOException { - return vectors.get(ord); - } - }; + return delegate.vectors(); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 97e581a1fc5..4dc9267f41a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -23,9 +23,6 @@ import java.io.BufferedReader; import java.io.ByteArrayOutputStream; -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; From ed233baaa1b054ddea24c661acd7541b1cebfc54 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 29 Oct 2024 16:59:36 +0000 Subject: [PATCH 22/25] fix off-heap scorer by falling back to on-heap. 
--- ...MemorySegmentByteVectorScorerSupplier.java | 139 ++++++++---------- ...ucene99MemorySegmentFlatVectorsScorer.java | 4 +- 2 files changed, 63 insertions(+), 80 deletions(-) diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index 0a4ee85a310..1f9cfb70df0 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -35,6 +35,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier final int maxOrd; final MemorySegmentAccessInput input; final KnnVectorValues values; // to support ordToDoc/getAcceptOrds + final VectorSimilarityFunction fallbackScorer; /** * Return an optional whose value, if present, is the scorer supplier. 
Otherwise, an empty @@ -57,11 +58,14 @@ static Optional create( } Lucene99MemorySegmentByteVectorScorerSupplier( - MemorySegmentAccessInput input, KnnVectorValues values) { + MemorySegmentAccessInput input, + KnnVectorValues values, + VectorSimilarityFunction fallbackScorer) { this.input = input; this.values = values; this.vectorByteSize = values.getVectorByteLength(); this.maxOrd = values.size(); + this.fallbackScorer = fallbackScorer; } static void checkInvariants(int maxOrd, int vectorByteLength, IndexInput input) { @@ -76,120 +80,99 @@ final void checkOrdinal(int ord) { } } - final MemorySegment getSegment(MemorySegmentAccessInput input, byte[] scratch, int ord) + final float fallbackScore( + MemorySegmentAccessInput input, long firstByteOffset, long secondByteOffset) throws IOException { - long byteOffset = (long) ord * vectorByteSize; - MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); - if (seg == null) { - input.readBytes(byteOffset, scratch, 0, vectorByteSize); - seg = MemorySegment.ofArray(scratch); + byte[] a = new byte[vectorByteSize]; + input.readBytes(firstByteOffset, a, 0, vectorByteSize); + byte[] b = new byte[vectorByteSize]; + input.readBytes(secondByteOffset, b, 0, vectorByteSize); + return fallbackScorer.compare(a, b); + } + + final float scoreFromOrds(MemorySegmentAccessInput input, int firstOrd, int secondOrd) + throws IOException { + checkOrdinal(firstOrd); + checkOrdinal(secondOrd); + long firstByteOffset = (long) firstOrd * vectorByteSize; + long secondByteOffset = (long) secondOrd * vectorByteSize; + MemorySegment firstSeg = input.segmentSliceOrNull(firstByteOffset, vectorByteSize); + MemorySegment secondSeg = input.segmentSliceOrNull(secondByteOffset, vectorByteSize); + + if (firstSeg == null || secondSeg == null) { + return fallbackScore(input, firstByteOffset, secondByteOffset); + } else { + return scoreFromSegments(firstSeg, secondSeg); } - return seg; + } + + abstract float scoreFromSegments(MemorySegment a, 
MemorySegment b); + + @Override + public RandomVectorScorer scorer(int ord) { + checkOrdinal(ord); + MemorySegmentAccessInput slice = input.clone(); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + @Override + public float score(int node) throws IOException { + return scoreFromOrds(slice, ord, node); + } + }; } static final class CosineSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { CosineSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { - super(input, values); + super(input, values, VectorSimilarityFunction.COSINE); } @Override - public RandomVectorScorer scorer(int ord) { - checkOrdinal(ord); - MemorySegmentAccessInput slice = input.clone(); - byte[] scratch1 = new byte[vectorByteSize]; - byte[] scratch2 = new byte[vectorByteSize]; - return new RandomVectorScorer.AbstractRandomVectorScorer(values) { - @Override - public float score(int node) throws IOException { - checkOrdinal(node); - float raw = - PanamaVectorUtilSupport.cosine( - getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); - return (1 + raw) / 2; - } - }; + float scoreFromSegments(MemorySegment a, MemorySegment b) { + float raw = PanamaVectorUtilSupport.cosine(a, b); + return (1 + raw) / 2; } } static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { DotProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { - super(input, values); + super(input, values, VectorSimilarityFunction.DOT_PRODUCT); } @Override - public RandomVectorScorer scorer(int ord) { - checkOrdinal(ord); - MemorySegmentAccessInput slice = input.clone(); - byte[] scratch1 = new byte[vectorByteSize]; - byte[] scratch2 = new byte[vectorByteSize]; - return new RandomVectorScorer.AbstractRandomVectorScorer(values) { - @Override - public float score(int node) throws IOException { - checkOrdinal(node); - // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len - float raw = - 
PanamaVectorUtilSupport.dotProduct( - getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); - return 0.5f + raw / (float) (values.dimension() * (1 << 15)); - } - }; + float scoreFromSegments(MemorySegment a, MemorySegment b) { + // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len + float raw = PanamaVectorUtilSupport.dotProduct(a, b); + return 0.5f + raw / (float) (values.dimension() * (1 << 15)); } } static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { EuclideanSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { - super(input, values); + super(input, values, VectorSimilarityFunction.EUCLIDEAN); } @Override - public RandomVectorScorer scorer(int ord) { - checkOrdinal(ord); - MemorySegmentAccessInput slice = input.clone(); - byte[] scratch1 = new byte[vectorByteSize]; - byte[] scratch2 = new byte[vectorByteSize]; - - return new RandomVectorScorer.AbstractRandomVectorScorer(values) { - @Override - public float score(int node) throws IOException { - checkOrdinal(node); - float raw = - PanamaVectorUtilSupport.squareDistance( - getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); - return 1 / (1f + raw); - } - }; + float scoreFromSegments(MemorySegment a, MemorySegment b) { + float raw = PanamaVectorUtilSupport.squareDistance(a, b); + return 1 / (1f + raw); } } static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { MaxInnerProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { - super(input, values); + super(input, values, VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT); } @Override - public RandomVectorScorer scorer(int ord) { - checkOrdinal(ord); - MemorySegmentAccessInput slice = input.clone(); - byte[] scratch1 = new byte[vectorByteSize]; - byte[] scratch2 = new byte[vectorByteSize]; - - return new RandomVectorScorer.AbstractRandomVectorScorer(values) { - @Override - public float 
score(int node) throws IOException { - checkOrdinal(node); - float raw = - PanamaVectorUtilSupport.dotProduct( - getSegment(slice, scratch1, ord), getSegment(slice, scratch2, node)); - if (raw < 0) { - return 1 / (1 + -1 * raw); - } - return raw + 1; - } - }; + float scoreFromSegments(MemorySegment a, MemorySegment b) { + float raw = PanamaVectorUtilSupport.dotProduct(a, b); + if (raw < 0) { + return 1 / (1 + -1 * raw); + } + return raw + 1; } } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java index bd8cbb2c388..ea8f10fab3b 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java @@ -45,7 +45,7 @@ public RandomVectorScorerSupplier getRandomVectorScorerSupplier( assert !(vectorValues instanceof QuantizedByteVectorValues); // currently only supports binary vectors if (vectorValues instanceof ByteVectorValues bvv - && bvv instanceof HasIndexSlice byteVectorValues + && bvv.vectors() instanceof HasIndexSlice byteVectorValues && byteVectorValues.getSlice() != null) { var scorer = Lucene99MemorySegmentByteVectorScorerSupplier.create( @@ -73,7 +73,7 @@ public RandomVectorScorer getRandomVectorScorer( // a quantized values here is a wrapping or delegation issue assert !(vectorValues instanceof QuantizedByteVectorValues); if (vectorValues instanceof ByteVectorValues bvv - && bvv instanceof HasIndexSlice byteVectorValues + && bvv.vectors() instanceof HasIndexSlice byteVectorValues && byteVectorValues.getSlice() != null) { var scorer = Lucene99MemorySegmentByteVectorScorer.create( From 42843606ec1134cb4e6f743625b381076f77ab13 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 7 Nov 2024 12:39:31 +0000 Subject: 
[PATCH 23/25] fix aliasing of vector scratch in quantized scorer --- .../lucene/codecs/hnsw/DefaultFlatVectorScorer.java | 12 ++++++------ .../Lucene99ScalarQuantizedVectorScorer.java | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index 3f07abcef80..f26244c433c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -100,12 +100,12 @@ private ByteScoringSupplier( @Override public RandomVectorScorer scorer(int ord) throws IOException { - ByteVectorValues.Bytes vectors1 = vectorValues.vectors(); - ByteVectorValues.Bytes vectors2 = vectorValues.vectors(); + byte[] query = vectorValues.vectors().get(ord); + ByteVectorValues.Bytes vectors = vectorValues.vectors(); return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { - return similarityFunction.compare(vectors1.get(ord), vectors2.get(node)); + return similarityFunction.compare(query, vectors.get(node)); } }; } @@ -130,12 +130,12 @@ private FloatScoringSupplier( @Override public RandomVectorScorer scorer(int ord) throws IOException { - FloatVectorValues.Floats vectors1 = vectorValues.vectors(); - FloatVectorValues.Floats vectors2 = vectorValues.vectors(); + float[] query = vectorValues.vectors().get(ord); + FloatVectorValues.Floats vectors = vectorValues.vectors(); return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { - return similarityFunction.compare(vectors1.get(ord), vectors2.get(node)); + return similarityFunction.compare(query, vectors.get(node)); } }; } diff --git 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java index d1491115d4e..2be5d274e6b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java @@ -280,18 +280,17 @@ private static final class ScalarQuantizedRandomVectorScorerSupplier private final VectorSimilarityFunction vectorSimilarityFunction; private final QuantizedByteVectorValues vectorValues; - private final QuantizedByteVectorValues.QuantizedBytes vectors; public ScalarQuantizedRandomVectorScorerSupplier( QuantizedByteVectorValues vectorValues, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { this.vectorValues = vectorValues; - this.vectors = vectorValues.vectors(); this.vectorSimilarityFunction = vectorSimilarityFunction; } @Override public RandomVectorScorer scorer(int ord) throws IOException { + QuantizedByteVectorValues.QuantizedBytes vectors = vectorValues.vectors(); byte[] vectorValue = vectors.get(ord); float offsetCorrection = vectors.getScoreCorrectionConstant(ord); return fromVectorSimilarity( From f1e0007b15245db3a048ebf5bf340e70929f21b7 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 8 Nov 2024 11:51:30 +0000 Subject: [PATCH 24/25] update flat vectors scorer to use only two vector dictionaries --- .../codecs/hnsw/DefaultFlatVectorScorer.java | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index f26244c433c..b5506e3e519 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -90,22 +90,24 @@ public String toString() { private static final class ByteScoringSupplier implements RandomVectorScorerSupplier { private final ByteVectorValues vectorValues; private final VectorSimilarityFunction similarityFunction; + private final ByteVectorValues.Bytes queryVectors; + private final ByteVectorValues.Bytes targetVectors; private ByteScoringSupplier( ByteVectorValues vectorValues, VectorSimilarityFunction similarityFunction) throws IOException { this.vectorValues = vectorValues; this.similarityFunction = similarityFunction; + this.queryVectors = vectorValues.vectors(); + this.targetVectors = vectorValues.vectors(); } @Override - public RandomVectorScorer scorer(int ord) throws IOException { - byte[] query = vectorValues.vectors().get(ord); - ByteVectorValues.Bytes vectors = vectorValues.vectors(); + public RandomVectorScorer scorer(int ord) { return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, vectors.get(node)); + return similarityFunction.compare(queryVectors.get(ord), targetVectors.get(node)); } }; } @@ -120,22 +122,24 @@ public String toString() { private static final class FloatScoringSupplier implements RandomVectorScorerSupplier { private final FloatVectorValues vectorValues; private final VectorSimilarityFunction similarityFunction; + private final FloatVectorValues.Floats queryVectors; + private final FloatVectorValues.Floats targetVectors; private FloatScoringSupplier( FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction) throws IOException { this.vectorValues = vectorValues; this.similarityFunction = similarityFunction; + this.queryVectors = vectorValues.vectors(); + this.targetVectors = vectorValues.vectors(); } @Override - public RandomVectorScorer scorer(int ord) throws IOException { - float[] query = 
vectorValues.vectors().get(ord); - FloatVectorValues.Floats vectors = vectorValues.vectors(); + public RandomVectorScorer scorer(int ord) { return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, vectors.get(node)); + return similarityFunction.compare(queryVectors.get(ord), targetVectors.get(node)); } }; } @@ -148,43 +152,43 @@ public String toString() { /** A {@link RandomVectorScorer} for float vectors. */ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final FloatVectorValues.Floats vectors; private final float[] query; private final VectorSimilarityFunction similarityFunction; + private final FloatVectorValues.Floats targetVectors; public FloatVectorScorer( FloatVectorValues vectorValues, float[] query, VectorSimilarityFunction similarityFunction) throws IOException { super(vectorValues); - this.vectors = vectorValues.vectors(); this.query = query; this.similarityFunction = similarityFunction; + this.targetVectors = vectorValues.vectors(); } @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, vectors.get(node)); + return similarityFunction.compare(query, targetVectors.get(node)); } } /** A {@link RandomVectorScorer} for byte vectors. 
*/ private static class ByteVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final ByteVectorValues.Bytes vectors; private final byte[] query; private final VectorSimilarityFunction similarityFunction; + private final ByteVectorValues.Bytes targetVectors; public ByteVectorScorer( ByteVectorValues vectorValues, byte[] query, VectorSimilarityFunction similarityFunction) throws IOException { super(vectorValues); - vectors = vectorValues.vectors(); this.query = query; this.similarityFunction = similarityFunction; + targetVectors = vectorValues.vectors(); } @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, vectors.get(node)); + return similarityFunction.compare(query, targetVectors.get(node)); } } } From ef13bade9b3ab8eb5de35e7297a9468ec45fc745 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Fri, 8 Nov 2024 16:47:46 +0000 Subject: [PATCH 25/25] reuse Floats and RandomVectorScorers --- .../codecs/hnsw/DefaultFlatVectorScorer.java | 49 ++++-- .../lucene95/OffHeapFloatVectorValues.java | 12 +- .../Lucene99ScalarQuantizedVectorsReader.java | 5 + .../Lucene99ScalarQuantizedVectorsWriter.java | 13 +- .../lucene/index/FloatVectorValues.java | 10 +- .../java/org/apache/lucene/util/hnsw/Bag.java | 71 +++++++++ .../lucene/util/hnsw/HnswGraphBuilder.java | 142 +++++++++--------- .../lucene/util/hnsw/NeighborArray.java | 43 +++--- .../lucene/util/hnsw/RandomVectorScorer.java | 18 ++- 9 files changed, 250 insertions(+), 113 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/hnsw/Bag.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index b5506e3e519..19d2288fe8f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ 
-22,6 +22,7 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.util.hnsw.Bag; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -122,26 +123,24 @@ public String toString() { private static final class FloatScoringSupplier implements RandomVectorScorerSupplier { private final FloatVectorValues vectorValues; private final VectorSimilarityFunction similarityFunction; - private final FloatVectorValues.Floats queryVectors; - private final FloatVectorValues.Floats targetVectors; + private final Bag pool = new Bag<>(); private FloatScoringSupplier( FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction) throws IOException { this.vectorValues = vectorValues; this.similarityFunction = similarityFunction; - this.queryVectors = vectorValues.vectors(); - this.targetVectors = vectorValues.vectors(); } @Override - public RandomVectorScorer scorer(int ord) { - return new RandomVectorScorer.AbstractRandomVectorScorer(vectorValues) { - @Override - public float score(int node) throws IOException { - return similarityFunction.compare(queryVectors.get(ord), targetVectors.get(node)); - } - }; + public RandomVectorScorer scorer(int ord) throws IOException { + FloatVectorScorer scorer = (FloatVectorScorer) pool.poll(); + if (scorer != null) { + scorer.setQuery(ord); + } else { + scorer = new FloatVectorScorer(vectorValues, ord, similarityFunction, pool); + } + return scorer; } @Override @@ -152,22 +151,40 @@ public String toString() { /** A {@link RandomVectorScorer} for float vectors. 
*/ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final float[] query; + private final FloatVectorValues.Floats vectors, queryVectors; private final VectorSimilarityFunction similarityFunction; - private final FloatVectorValues.Floats targetVectors; + private float[] query; + + FloatVectorScorer( + FloatVectorValues vectorValues, + int ord, + VectorSimilarityFunction similarityFunction, + Bag pool) + throws IOException { + super(vectorValues, pool); + this.similarityFunction = similarityFunction; + vectors = vectorValues.vectors(); + queryVectors = vectorValues.vectors(); + query = queryVectors.get(ord); + } public FloatVectorScorer( FloatVectorValues vectorValues, float[] query, VectorSimilarityFunction similarityFunction) throws IOException { super(vectorValues); - this.query = query; this.similarityFunction = similarityFunction; - this.targetVectors = vectorValues.vectors(); + vectors = vectorValues.vectors(); + queryVectors = null; + this.query = query; + } + + private void setQuery(int ord) throws IOException { + query = queryVectors.get(ord); } @Override public float score(int node) throws IOException { - return similarityFunction.compare(query, targetVectors.get(node)); + return similarityFunction.compare(query, vectors.get(node)); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 736a552ff04..c7af020dc90 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -28,12 +28,13 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.hnsw.Bag; import org.apache.lucene.util.hnsw.RandomVectorScorer; import 
org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ public abstract class OffHeapFloatVectorValues extends FloatVectorValues implements HasIndexSlice { - + private final Bag pool = new Bag<>(); protected final int dimension; protected final int size; protected final IndexInput slice; @@ -73,6 +74,10 @@ public IndexInput getSlice() { @Override public Floats vectors() { + Floats floats = pool.poll(); + if (floats != null) { + return floats; + } IndexInput sliceCopy = slice.clone(); float[] value = new float[dimension]; return new Floats() { @@ -88,6 +93,11 @@ public float[] get(int targetOrd) throws IOException { lastOrd = targetOrd; return value; } + + @Override + public void close() throws IOException { + pool.offer(this); + } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java index 86225d69e55..34f6a2b4f9e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java @@ -428,6 +428,11 @@ public Floats vectors() throws IOException { public float[] get(int ord) throws IOException { return rawVectors.get(ord); } + + @Override + public void close() throws IOException { + rawVectors.close(); + } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index fc401c93529..209218707d3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -1216,11 
+1216,9 @@ public DocIndexIterator iterator() throws IOException { static final class NormalizedFloatVectorValues extends FloatVectorValues { private final FloatVectorValues vectorValues; - private final Floats floats; public NormalizedFloatVectorValues(FloatVectorValues vectorValues) throws IOException { this.vectorValues = vectorValues; - floats = vectorValues.vectors(); } @Override @@ -1239,15 +1237,22 @@ public int ordToDoc(int ord) { } @Override - public Floats vectors() { + public Floats vectors() throws IOException { float[] normalizedVector = new float[vectorValues.dimension()]; return new Floats() { + Floats delegate = vectorValues.vectors(); + @Override public float[] get(int ord) throws IOException { - System.arraycopy(floats.get(ord), 0, normalizedVector, 0, normalizedVector.length); + System.arraycopy(delegate.get(ord), 0, normalizedVector, 0, normalizedVector.length); VectorUtil.l2normalize(normalizedVector); return normalizedVector; } + + @Override + public void close() throws IOException { + delegate.close(); + } }; } diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index 6a3ebe4a1af..38fd4a2d8cf 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -33,7 +33,7 @@ public abstract class FloatVectorValues extends KnnVectorValues { protected FloatVectorValues() {} /** A random access (lookup by ord) provider of the vector values */ - public abstract static class Floats { + public abstract static class Floats implements AutoCloseable { /** * Return the vector value for the given vector ordinal which must be in [0, size() - 1], * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. 
@@ -42,6 +42,11 @@ public abstract static class Floats { */ public abstract float[] get(int ord) throws IOException; + @Override + public void close() throws IOException { + // by default do nothing. Some implementations do more interesting resource management. + } + /** A Floats containing no vectors. Throws UnsupportedOperationException if get() is called. */ public static final Floats EMPTY = new Floats() { @@ -118,6 +123,9 @@ public Floats vectors() { public float[] get(int ord) throws IOException { return vectors.get(ord); } + + @Override + public void close() {} }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/Bag.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/Bag.java new file mode 100644 index 00000000000..df322093f53 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/Bag.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +/** + * A collection of objects that is threadsafe, providing offer(T) that tries to add an element and + * poll() that removes and returns an element or null. The storage will never grow. 
There are no + * guarantees about which object will be returned from poll(), just that it will be one that was + * added by offer(). + */ +public class Bag { + private static final int DEFAULT_CAPACITY = 64; + + private final Object[] elements; + private int writeTo; + private int readFrom; + + public Bag() { + this(DEFAULT_CAPACITY); + } + + public Bag(int capacity) { + elements = new Object[capacity]; + } + + public synchronized boolean offer(T element) { + if (full()) { + return false; + } + elements[writeTo] = element; + writeTo = (writeTo + 1) % elements.length; + return true; + } + + @SuppressWarnings("unchecked") + public synchronized T poll() { + if (empty()) { + return null; + } + T result = (T) elements[readFrom]; + readFrom = (readFrom + 1) % elements.length; + return result; + } + + private boolean full() { + int headroom = readFrom - 1 - writeTo; + if (headroom < 0) { + headroom += elements.length; + } + return headroom == 0; + } + + private boolean empty() { + return readFrom == writeTo; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java index bed1480e926..0cbadeaa7a5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java @@ -233,74 +233,75 @@ to the newly introduced levels (repeating step 2,3 for new levels) and again try if (frozen) { throw new IllegalStateException("Graph builder is already frozen"); } - RandomVectorScorer scorer = scorerSupplier.scorer(node); - final int nodeLevel = getRandomGraphLevel(ml, random); - // first add nodes to all levels - for (int level = nodeLevel; level >= 0; level--) { - hnsw.addNode(level, node); - } - // then promote itself as entry node if entry node is not set - if (hnsw.trySetNewEntryNode(node, nodeLevel)) { - return; - } - // if the entry node is already set, then we have to do all connections 
first before we can - // promote ourselves as entry node - - int lowestUnsetLevel = 0; - int curMaxLevel; - do { - curMaxLevel = hnsw.numLevels() - 1; - // NOTE: the entry node and max level may not be paired, but because we get the level first - // we ensure that the entry node we get later will always exist on the curMaxLevel - int[] eps = new int[] {hnsw.entryNode()}; - - // we first do the search from top to bottom - // for levels > nodeLevel search with topk = 1 - GraphBuilderKnnCollector candidates = entryCandidates; - for (int level = curMaxLevel; level > nodeLevel; level--) { - candidates.clear(); - graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null); - eps[0] = candidates.popNode(); - } - - // for levels <= nodeLevel search with topk = beamWidth, and add connections - candidates = beamCandidates; - NeighborArray[] scratchPerLevel = - new NeighborArray[Math.min(nodeLevel, curMaxLevel) - lowestUnsetLevel + 1]; - for (int i = scratchPerLevel.length - 1; i >= 0; i--) { - int level = i + lowestUnsetLevel; - candidates.clear(); - graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null); - eps = candidates.popUntilNearestKNodes(); - scratchPerLevel[i] = new NeighborArray(Math.max(beamCandidates.k(), M + 1), false); - popToScratch(candidates, scratchPerLevel[i]); + try (RandomVectorScorer scorer = scorerSupplier.scorer(node)) { + final int nodeLevel = getRandomGraphLevel(ml, random); + // first add nodes to all levels + for (int level = nodeLevel; level >= 0; level--) { + hnsw.addNode(level, node); } - - // then do connections from bottom up - for (int i = 0; i < scratchPerLevel.length; i++) { - addDiverseNeighbors(i + lowestUnsetLevel, node, scratchPerLevel[i]); - } - lowestUnsetLevel += scratchPerLevel.length; - assert lowestUnsetLevel == Math.min(nodeLevel, curMaxLevel) + 1; - if (lowestUnsetLevel > nodeLevel) { + // then promote itself as entry node if entry node is not set + if (hnsw.trySetNewEntryNode(node, nodeLevel)) { 
return; } - assert lowestUnsetLevel == curMaxLevel + 1 && nodeLevel > curMaxLevel; - if (hnsw.tryPromoteNewEntryNode(node, nodeLevel, curMaxLevel)) { - return; - } - if (hnsw.numLevels() == curMaxLevel + 1) { - // This should never happen if all the calculations are correct - throw new IllegalStateException( - "We're not able to promote node " - + node - + " at level " - + nodeLevel - + " as entry node. But the max graph level " - + curMaxLevel - + " has not changed while we are inserting the node."); - } - } while (true); + // if the entry node is already set, then we have to do all connections first before we can + // promote ourselves as entry node + + int lowestUnsetLevel = 0; + int curMaxLevel; + do { + curMaxLevel = hnsw.numLevels() - 1; + // NOTE: the entry node and max level may not be paired, but because we get the level first + // we ensure that the entry node we get later will always exist on the curMaxLevel + int[] eps = new int[] {hnsw.entryNode()}; + + // we first do the search from top to bottom + // for levels > nodeLevel search with topk = 1 + GraphBuilderKnnCollector candidates = entryCandidates; + for (int level = curMaxLevel; level > nodeLevel; level--) { + candidates.clear(); + graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null); + eps[0] = candidates.popNode(); + } + + // for levels <= nodeLevel search with topk = beamWidth, and add connections + candidates = beamCandidates; + NeighborArray[] scratchPerLevel = + new NeighborArray[Math.min(nodeLevel, curMaxLevel) - lowestUnsetLevel + 1]; + for (int i = scratchPerLevel.length - 1; i >= 0; i--) { + int level = i + lowestUnsetLevel; + candidates.clear(); + graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null); + eps = candidates.popUntilNearestKNodes(); + scratchPerLevel[i] = new NeighborArray(Math.max(beamCandidates.k(), M + 1), false); + popToScratch(candidates, scratchPerLevel[i]); + } + + // then do connections from bottom up + for (int i = 0; i < 
scratchPerLevel.length; i++) { + addDiverseNeighbors(i + lowestUnsetLevel, node, scratchPerLevel[i]); + } + lowestUnsetLevel += scratchPerLevel.length; + assert lowestUnsetLevel == Math.min(nodeLevel, curMaxLevel) + 1; + if (lowestUnsetLevel > nodeLevel) { + return; + } + assert lowestUnsetLevel == curMaxLevel + 1 && nodeLevel > curMaxLevel; + if (hnsw.tryPromoteNewEntryNode(node, nodeLevel, curMaxLevel)) { + return; + } + if (hnsw.numLevels() == curMaxLevel + 1) { + // This should never happen if all the calculations are correct + throw new IllegalStateException( + "We're not able to promote node " + + node + + " at level " + + nodeLevel + + " as entry node. But the max graph level " + + curMaxLevel + + " has not changed while we are inserting the node."); + } + } while (true); + } } private long printGraphBuildStatus(int node, long start, long t) { @@ -393,11 +394,12 @@ private static void popToScratch(GraphBuilderKnnCollector candidates, NeighborAr */ private boolean diversityCheck(int candidate, float score, NeighborArray neighbors) throws IOException { - RandomVectorScorer scorer = scorerSupplier.scorer(candidate); - for (int i = 0; i < neighbors.size(); i++) { - float neighborSimilarity = scorer.score(neighbors.nodes()[i]); - if (neighborSimilarity >= score) { - return false; + try (RandomVectorScorer scorer = scorerSupplier.scorer(candidate)) { + for (int i = 0; i < neighbors.size(); i++) { + float neighborSimilarity = scorer.score(neighbors.nodes()[i]); + if (neighborSimilarity >= score) { + return false; + } } } return true; diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java index 716364a39dc..592aedc57e6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java @@ -237,8 +237,10 @@ private int descSortFindRightMostInsertionPoint(float newScore, int bound) { */ 
private int findWorstNonDiverse(int nodeOrd, RandomVectorScorerSupplier scorerSupplier) throws IOException { - RandomVectorScorer scorer = scorerSupplier.scorer(nodeOrd); - int[] uncheckedIndexes = sort(scorer); + int[] uncheckedIndexes; + try (RandomVectorScorer scorer = scorerSupplier.scorer(nodeOrd)) { + uncheckedIndexes = sort(scorer); + } assert uncheckedIndexes != null : "We will always have something unchecked"; int uncheckedCursor = uncheckedIndexes.length - 1; for (int i = size - 1; i > 0; i--) { @@ -263,25 +265,26 @@ private boolean isWorstNonDiverse( RandomVectorScorerSupplier scorerSupplier) throws IOException { float minAcceptedSimilarity = scores[candidateIndex]; - RandomVectorScorer scorer = scorerSupplier.scorer(nodes[candidateIndex]); - if (candidateIndex == uncheckedIndexes[uncheckedCursor]) { - // the candidate itself is unchecked - for (int i = candidateIndex - 1; i >= 0; i--) { - float neighborSimilarity = scorer.score(nodes[i]); - // candidate node is too similar to node i given its score relative to the base node - if (neighborSimilarity >= minAcceptedSimilarity) { - return true; + try (RandomVectorScorer scorer = scorerSupplier.scorer(nodes[candidateIndex])) { + if (candidateIndex == uncheckedIndexes[uncheckedCursor]) { + // the candidate itself is unchecked + for (int i = candidateIndex - 1; i >= 0; i--) { + float neighborSimilarity = scorer.score(nodes[i]); + // candidate node is too similar to node i given its score relative to the base node + if (neighborSimilarity >= minAcceptedSimilarity) { + return true; + } } - } - } else { - // else we just need to make sure candidate does not violate diversity with the (newly - // inserted) unchecked nodes - assert candidateIndex > uncheckedIndexes[uncheckedCursor]; - for (int i = uncheckedCursor; i >= 0; i--) { - float neighborSimilarity = scorer.score(nodes[uncheckedIndexes[i]]); - // candidate node is too similar to node i given its score relative to the base node - if (neighborSimilarity >= 
minAcceptedSimilarity) { - return true; + } else { + // else we just need to make sure candidate does not violate diversity with the (newly + // inserted) unchecked nodes + assert candidateIndex > uncheckedIndexes[uncheckedCursor]; + for (int i = uncheckedCursor; i >= 0; i--) { + float neighborSimilarity = scorer.score(nodes[uncheckedIndexes[i]]); + // candidate node is too similar to node i given its score relative to the base node + if (neighborSimilarity >= minAcceptedSimilarity) { + return true; + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java index a135df43699..7d3b6c1b7f9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java @@ -22,7 +22,7 @@ import org.apache.lucene.util.Bits; /** A {@link RandomVectorScorer} for scoring random nodes in batches against an abstract query. */ -public interface RandomVectorScorer { +public interface RandomVectorScorer extends AutoCloseable { /** * Returns the score between the query and the provided node. * @@ -56,9 +56,13 @@ default Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; } + @Override + default void close() {} + /** Creates a default scorer for random access vectors. */ abstract class AbstractRandomVectorScorer implements RandomVectorScorer { private final KnnVectorValues values; + private final Bag scorerPool; /** * Creates a new scorer for the given vector values. 
@@ -66,7 +70,12 @@ abstract class AbstractRandomVectorScorer implements RandomVectorScorer { * @param values the vector values */ public AbstractRandomVectorScorer(KnnVectorValues values) { + this(values, null); + } + + public AbstractRandomVectorScorer(KnnVectorValues values, Bag scorerPool) { this.values = values; + this.scorerPool = scorerPool; } @Override @@ -83,5 +92,12 @@ public int ordToDoc(int ord) { public Bits getAcceptOrds(Bits acceptDocs) { return values.getAcceptOrds(acceptDocs); } + + @Override + public void close() { + if (scorerPool != null) { + scorerPool.offer(this); + } + } } }