diff --git a/README.md b/README.md index 6a08f1c..6f92385 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,17 @@ python -m svsbench.build \ --proportion_vectors_init 0.5 --batch_size 10000 ``` +#### Building a LeanVec-OOD index + +[LeanVec-OOD](https://openreview.net/forum?id=wczqrpOrIc) is a dimensionality reduction method for cases where query and base vectors have different distributions (e.g., text-to-image search). Use `--train_query_file` to provide training queries. + +```sh +python -m svsbench.build \ + --vecs_file /path/to/vectors.fvecs \ + --svs_type leanvec4x8 \ + --train_query_file /path/to/train_query_vectors.fvecs +``` + ### Computing the ground truth For the query vectors used in performance measurements: diff --git a/pyproject.toml b/pyproject.toml index bb8cb0b..71a9823 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "numpy>=1.10", "scalable-vs>=0.0.7", "tqdm>=4.67", + "typer-slim>=0.15.2", ] [build-system] diff --git a/src/svsbench/build.py b/src/svsbench/build.py index 2fbded7..b1f9b55 100644 --- a/src/svsbench/build.py +++ b/src/svsbench/build.py @@ -4,19 +4,22 @@ import argparse import logging -import os import sys import tempfile import time from pathlib import Path import numpy as np +import numpy.typing as npt import svs from tqdm import tqdm -from . import consts +from . import consts, utils +from .generate_leanvec_matrices import ( + generate_leanvec_matrices, + save_leanvec_matrices, +) from .loader import create_loader -from . import utils logger = logging.getLogger(__file__) @@ -86,11 +89,40 @@ def _read_args(argv: list[str] | None = None) -> argparse.Namespace: parser.add_argument( "--leanvec_dims", help="LeanVec dimensionality", type=int ) - parser.add_argument("--no_save", action="store_true") + parser.add_argument( + "--no_save", action="store_true", help="Do not save built index" + ) + parser.add_argument( + "--train_query_file", + help="Query *vecs file for LeanVec out-of-distribution training", + type=Path, + ) + parser.add_argument( + "--train_max_vectors", + help="Maximum number of base vectors from vecs_file" + " to use for LeanVec out-of-distribution training (0 for all)", + type=int, + default=consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS, + ) + parser.add_argument( + "--no_save_matrices", + action="store_true", + help="Do not save LeanVec matrices", + ) + parser.add_argument( + "--data_matrix_file", + help="Data matrix npy file for LeanVec", + type=Path, + ) + parser.add_argument( + "--query_matrix_file", + help="Query matrix npy file for LeanVec", + type=Path, + ) return parser.parse_args(argv) -def main(argv: str | None = None) -> None: +def main(argv: list[str] | None = None) -> None: args = _read_args(argv) log_file = utils.configure_logger( logger, args.log_dir if args.log_dir is not None else args.out_dir @@ -98,6 +130,42 @@ def main(argv: str | None = None) -> None: print("Logging to", log_file, sep="\n") logger.info({"argv": argv if argv else sys.argv}) args.out_dir.mkdir(exist_ok=True) + if args.data_matrix_file is not None: + if args.query_matrix_file is None: + raise ValueError( + "query_matrix_file must be provided with data_matrix_file" + ) + data_matrix = np.load(args.data_matrix_file) + query_matrix = np.load(args.query_matrix_file) + elif args.train_query_file is not None: + (data_matrix, query_matrix), (leanvec_dims_effective, _) = ( + generate_leanvec_matrices( + args.vecs_file, + args.train_query_file, + args.train_max_vectors, + args.leanvec_dims, + ) + ) + if not args.no_save_matrices: + data_matrix_path, query_matrix_path = save_leanvec_matrices( + args.vecs_file, + args.train_query_file, + args.train_max_vectors, + leanvec_dims_effective, + data_matrix, + query_matrix, + args.out_dir, + ) + logger.info( + { + "saved_leanvec_matrices": ( + data_matrix_path, + query_matrix_path, + ) + } + ) + else: + data_matrix = query_matrix = None if args.static: index, name = build_static( vecs_path=args.vecs_file, @@ -110,6 +178,8 @@ def main(argv: str | None = None) -> None: alpha=args.alpha, max_threads=args.max_threads, leanvec_dims=args.leanvec_dims, + data_matrix=data_matrix, + query_matrix=query_matrix, ) else: index, name, ingest_time, delete_time = build_dynamic( @@ -135,6 +205,8 @@ def main(argv: str | None = None) -> None: convert_vecs=args.convert_vecs, tmp_dir=args.tmp_dir, leanvec_dims=args.leanvec_dims, + data_matrix=data_matrix, + query_matrix=query_matrix, ) np.save(args.out_dir / (name + ".ingest.npy"), ingest_time) if args.num_vectors_delete > 0: @@ -167,6 +239,8 @@ def build_dynamic( convert_vecs: bool = False, tmp_dir: Path = Path("/dev/shm"), leanvec_dims: int | None = None, + data_matrix: npt.NDArray | None = None, + query_matrix: npt.NDArray | None = None, ) -> tuple[svs.DynamicVamana, str]: """Build SVS index.""" logger.info({"build_args": locals()}) @@ -264,6 +338,8 @@ def build_dynamic( data_dir=tmp_idx_dir / "data", compress=not svs_type.startswith("float"), leanvec_dims=leanvec_dims, + data_matrix=data_matrix, + query_matrix=query_matrix, ) index = svs.DynamicVamana( str(tmp_idx_dir / "config"), @@ -343,6 +419,8 @@ def build_static( alpha: float | None = None, max_threads: int = 1, leanvec_dims: int | None = None, + data_matrix: npt.NDArray | None = None, + query_matrix: npt.NDArray | None = None, ) -> tuple[svs.Vamana, str]: logger.info({"build_args": locals()}) logger.info(utils.read_system_config()) @@ -360,7 +438,11 @@ def build_static( index = svs.Vamana.build( parameters, create_loader( - svs_type, vecs_path=vecs_path, leanvec_dims=leanvec_dims + svs_type, + vecs_path=vecs_path, + leanvec_dims=leanvec_dims, + data_matrix=data_matrix, + query_matrix=query_matrix, ), distance, num_threads=max_threads, diff --git a/src/svsbench/consts.py b/src/svsbench/consts.py index aaad1ed..9914a3d 100644 --- a/src/svsbench/consts.py +++ b/src/svsbench/consts.py @@ -7,6 +7,9 @@ import numpy as np import svs +DEFAULT_LEANVEC_DIMS: Final[int] = -4 +DEFAULT_LEANVEC_TRAIN_MAX_VECTORS: Final[int] = 100_000 + DISTANCE_TO_ALPHA: Final[dict[svs.DistanceType, float]] = { svs.DistanceType.Cosine: 0.95, svs.DistanceType.L2: 1.2, diff --git a/src/svsbench/generate_leanvec_matrices.py b/src/svsbench/generate_leanvec_matrices.py new file mode 100644 index 0000000..7a81e46 --- /dev/null +++ b/src/svsbench/generate_leanvec_matrices.py @@ -0,0 +1,87 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Generate LeanVec-OOD matrices.""" + +from pathlib import Path + +import numpy as np +import numpy.typing as npt +import svs +import typer + +from . import consts, merge + + +def main( + vecs_file: Path, + train_query_file: Path, + max_vectors: int = consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS, + leanvec_dims: int = consts.DEFAULT_LEANVEC_DIMS, + out_dir: Path = Path(), +) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + (data_matrix, query_matrix), (leanvec_dims_effective, _) = ( + generate_leanvec_matrices( + vecs_file, train_query_file, max_vectors, leanvec_dims + ) + ) + data_matrix_path, query_matrix_path = save_leanvec_matrices( + vecs_file, + train_query_file, + max_vectors, + leanvec_dims_effective, + data_matrix, + query_matrix, + out_dir, + ) + print("Saved LeanVec matrices:", data_matrix_path, query_matrix_path) + + +def generate_leanvec_matrices( + vecs_file: Path, + train_query_file: Path, + max_vectors: int = consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS, + leanvec_dims: int | None = None, +) -> tuple[tuple[npt.NDArray, npt.NDArray], tuple[int, int]]: + """Generate LeanVec matrices from base vectors and query vectors.""" + base_vectors = merge.read_vecs(vecs_file, max_vectors) + query_vectors = merge.read_vecs(train_query_file) + dim = base_vectors.shape[1] + if leanvec_dims is None: + leanvec_dims = consts.DEFAULT_LEANVEC_DIMS + if leanvec_dims < 0: + leanvec_dims = dim // -leanvec_dims + return svs.compute_leanvec_matrices( + base_vectors, query_vectors, leanvec_dims + ), (leanvec_dims, max_vectors) + + +def save_leanvec_matrices( + vecs_file: Path, + train_query_file: Path, + max_vectors: int, + leanvec_dims: int, + data_matrix: npt.NDArray, + query_matrix: npt.NDArray, + out_dir: Path, +) -> tuple[Path, Path]: + """Save LeanVec matrices to files.""" + name_components = [ + vecs_file.name, + train_query_file.name, + str(leanvec_dims), + ] + if max_vectors > 0: + name_components.append(str(max_vectors)) + base_name = "__".join(name_components) + data_matrix_path = out_dir / (base_name + ".data.npy") + query_matrix_path = out_dir / (base_name + ".query.npy") + np.save(data_matrix_path, data_matrix) + np.save(query_matrix_path, query_matrix) + return data_matrix_path, query_matrix_path + + +if __name__ == "__main__": + # https://github.com/fastapi/typer/issues/341 + typer.main.get_command_name = lambda name: name + typer.run(main) diff --git a/src/svsbench/loader.py b/src/svsbench/loader.py index 548dc3d..e7a6ca8 100644 --- a/src/svsbench/loader.py +++ b/src/svsbench/loader.py @@ -4,6 +4,7 @@ from pathlib import Path +import numpy.typing as npt import svs from . import consts @@ -18,6 +19,8 @@ def create_loader( leanvec_dims: int | None = None, leanvec_alignment: int = 32, lvq_strategy: svs.LVQStrategy | None = None, + data_matrix: npt.NDArray | None = None, + query_matrix: npt.NDArray | None = None, ) -> svs.VectorDataLoader | svs.LVQLoader | svs.LeanVecLoader: """Create loader.""" unkown_msg = f"Unknown {svs_type=}" @@ -89,7 +92,7 @@ def create_loader( raise ValueError(unkown_msg) if vecs_path is not None or compress: if leanvec_dims is None: - leanvec_dims = -4 + leanvec_dims = consts.DEFAULT_LEANVEC_DIMS if leanvec_dims < 0: leanvec_dims = loader_or_str.dims // -leanvec_dims loader = svs.LeanVecLoader( @@ -98,6 +101,8 @@ def create_loader( primary_kind=primary, secondary_kind=secondary, alignment=leanvec_alignment, + data_matrix=data_matrix, + query_matrix=query_matrix, ) else: loader = svs.LeanVecLoader( diff --git a/src/svsbench/merge.py b/src/svsbench/merge.py index b66b592..5c50dd6 100644 --- a/src/svsbench/merge.py +++ b/src/svsbench/merge.py @@ -34,12 +34,14 @@ def _read_dim(fname: Path) -> int: return dim -def read_vecs(fname: Path) -> npt.NDArray: +def read_vecs(fname: Path, max_vectors: int | None = None) -> npt.NDArray: """Create NumPy memory maps.""" + if max_vectors == 0: + max_vectors = None dim = _read_dim(fname) padding = SUFFIX_TO_PADDING[fname.suffix] array = np.memmap(fname, dtype=SUFFIX_TO_DTYPE[fname.suffix], mode="r") - return array.reshape(-1, dim + padding)[:, padding:] + return array.reshape(-1, dim + padding)[:max_vectors, padding:] def write(inputs: list[Path], output: Path, num_vectors: int | None) -> None: diff --git a/tests/test_build.py b/tests/test_build.py index 919e08b..c0726c9 100644 --- a/tests/test_build.py +++ b/tests/test_build.py @@ -5,6 +5,10 @@ import svsbench.build from svsbench.consts import SUFFIX_TO_SVS_TYPE +from svsbench.generate_leanvec_matrices import ( + generate_leanvec_matrices, + save_leanvec_matrices, +) @pytest.mark.parametrize("svs_type", svsbench.consts.SVS_TYPES) @@ -26,6 +30,7 @@ def test_build_static(svs_type, tmp_vecs): distance=svs.DistanceType.L2, ) + @pytest.mark.parametrize("svs_type", svsbench.consts.SVS_TYPES) def test_build_dynamic(svs_type, tmp_vecs): if SUFFIX_TO_SVS_TYPE[tmp_vecs.suffix] == "float16": @@ -46,3 +51,54 @@ def test_build_dynamic(svs_type, tmp_vecs): distance=svs.DistanceType.L2, convert_vecs=True, ) + + +def test_main_with_train_query(tmp_path, tmp_vecs, query_path): + if SUFFIX_TO_SVS_TYPE[tmp_vecs.suffix] == "float16": + pytest.xfail("https://github.com/intel/ScalableVectorSearch/issues/93") + svsbench.build.main( + [ + "--vecs_file", + str(tmp_vecs), + "--svs_type", + "leanvec4x8", + "--train_query_file", + str(query_path), + "--out_dir", + str(tmp_path), + ] + ) + + +def test_main_with_matrices(tmp_path, tmp_vecs, query_path): + if SUFFIX_TO_SVS_TYPE[tmp_vecs.suffix] == "float16": + pytest.xfail("https://github.com/intel/ScalableVectorSearch/issues/93") + (data_matrix, query_matrix), (leanvec_dims_effective, max_vectors_effective) = ( + generate_leanvec_matrices( + tmp_vecs, + query_path, + ) + ) + data_matrix_path, query_matrix_path = save_leanvec_matrices( + tmp_vecs, + query_path, + max_vectors_effective, + leanvec_dims_effective, + data_matrix, + query_matrix, + tmp_path, + ) + svsbench.build.main( + [ + "--vecs_file", + str(tmp_vecs), + "--svs_type", + "leanvec4x8", + "--data_matrix_file", + str(data_matrix_path), + "--query_matrix_file", + str(query_matrix_path), + "--out_dir", + str(tmp_path), + ] + ) diff --git a/tests/test_generate_leanvec_matrices.py b/tests/test_generate_leanvec_matrices.py new file mode 100644 index 0000000..2a7131f --- /dev/null +++ b/tests/test_generate_leanvec_matrices.py @@ -0,0 +1,20 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import svsbench.generate_leanvec_matrices + + +def test_main(tmp_path, tmp_vecs, query_path): + svsbench.generate_leanvec_matrices.main( + tmp_vecs, + query_path, + out_dir=tmp_path, + ) + + +def test_generate_with_all_vectors(tmp_vecs, query_path): + svsbench.generate_leanvec_matrices.generate_leanvec_matrices( + tmp_vecs, + query_path, + max_vectors=0, + ) diff --git a/uv.lock b/uv.lock index 373becc..fbf4d9e 100644 --- a/uv.lock +++ b/uv.lock @@ -11,6 +11,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/8d/2733707a9daf4df14cb6ae251691cfc49fcce888234ec6484364d8319b46/archspec-0.2.5-py3-none-any.whl", hash = "sha256:604bd4115cb4c18e50a22a9b4a1e516706712263790d7d2994aaa595e70082f6", size = 76161 }, ] +[[package]] +name = "click" +version = "8.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -176,6 +188,7 @@ dependencies = [ { name = "numpy" }, { name = "scalable-vs" }, { name = "tqdm" }, + { name = "typer-slim" }, ] [package.dev-dependencies] @@ -190,6 +203,7 @@ requires-dist = [ { name = "numpy", specifier = ">=1.10" }, { name = "scalable-vs", specifier = ">=0.0.7" }, { name = "tqdm", specifier = ">=4.67" }, + { name = "typer-slim", specifier = ">=0.15.2" }, ] [package.metadata.requires-dev] @@ -219,3 +233,25 @@ sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, ] + +[[package]] +name = "typer-slim" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/96/5749d5b6920fab0db15ce39d850b86dfee21518cd1c81b7a5fe5a495a92a/typer_slim-0.16.0.tar.gz", hash = "sha256:d6483c367f98529884a5d45a028f5d2686ae93cd9d33d518661069f382c08546", size = 102664 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/2d/dc1f0c872615aef018783408ac993be7832726a4b30032e317e9f2858267/typer_slim-0.16.0-py3-none-any.whl", hash = "sha256:8aa94eef73b876506b9d239cd70cfedefac95541be8f060688aabfc800f53d67", size = 46377 }, +] + +[[package]] +name = "typing-extensions" +version = "4.14.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906 }, +]