Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ python -m svsbench.build \
--proportion_vectors_init 0.5 --batch_size 10000
```

#### Building a LeanVec-OOD index

LeanVec-OOD is a dimensionality reduction method that works even if the query vectors have a different distribution from the base vectors.

```sh
python -m svsbench.build \
--vecs_file /path/to/vectors.fvecs \
--svs_type leanvec4x8 \
--train_query_file /path/to/train_query_vectors.fvecs
```

### Computing the ground truth

For the query vectors used in performance measurements:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"numpy>=1.10",
"scalable-vs>=0.0.7",
"tqdm>=4.67",
"typer-slim>=0.15.2",
]

[build-system]
Expand Down
94 changes: 88 additions & 6 deletions src/svsbench/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@

import argparse
import logging
import os
import sys
import tempfile
import time
from pathlib import Path

import numpy as np
import numpy.typing as npt
import svs
from tqdm import tqdm

from . import consts
from . import consts, utils
from .generate_leanvec_matrices import (
generate_leanvec_matrices,
save_leanvec_matrices,
)
from .loader import create_loader
from . import utils

logger = logging.getLogger(__file__)

Expand Down Expand Up @@ -86,18 +89,83 @@ def _read_args(argv: list[str] | None = None) -> argparse.Namespace:
parser.add_argument(
"--leanvec_dims", help="LeanVec dimensionality", type=int
)
parser.add_argument("--no_save", action="store_true")
parser.add_argument(
"--no_save", action="store_true", help="Do not save built index"
)
parser.add_argument(
"--train_query_file",
help="Query *vecs file for LeanVec out-of-distribution training",
type=Path,
)
parser.add_argument(
"--train_max_vectors",
help="Maximum number of base vectors from vecs_file"
" to use for LeanVec out-of-distribution training (0 for all)",
type=int,
default=consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS,
)
parser.add_argument(
"--no_save_matrices",
action="store_true",
help="Do not save LeanVec matrices",
)
parser.add_argument(
"--data_matrix_file",
help="Data matrix npy file for LeanVec",
type=Path,
)
parser.add_argument(
"--query_matrix_file",
help="Query matrix npy file for LeanVec",
type=Path,
)
return parser.parse_args(argv)


def main(argv: str | None = None) -> None:
def main(argv: list[str] | None = None) -> None:
args = _read_args(argv)
log_file = utils.configure_logger(
logger, args.log_dir if args.log_dir is not None else args.out_dir
)
print("Logging to", log_file, sep="\n")
logger.info({"argv": argv if argv else sys.argv})
args.out_dir.mkdir(exist_ok=True)
if args.data_matrix_file is not None:
if args.query_matrix_file is None:
raise ValueError(
"query_matrix_file must be provided with data_matrix_file"
)
data_matrix = np.load(args.data_matrix_file)
query_matrix = np.load(args.query_matrix_file)
elif args.train_query_file is not None:
(data_matrix, query_matrix), (leanvec_dims_effective, _) = (
generate_leanvec_matrices(
args.vecs_file,
args.train_query_file,
args.train_max_vectors,
args.leanvec_dims,
)
)
if not args.no_save_matrices:
data_matrix_path, query_matrix_path = save_leanvec_matrices(
args.vecs_file,
args.train_query_file,
args.train_max_vectors,
leanvec_dims_effective,
data_matrix,
query_matrix,
args.out_dir,
)
logger.info(
{
"saved_leanvec_matrices": (
data_matrix_path,
query_matrix_path,
)
}
)
else:
data_matrix = query_matrix = None
if args.static:
index, name = build_static(
vecs_path=args.vecs_file,
Expand All @@ -110,6 +178,8 @@ def main(argv: str | None = None) -> None:
alpha=args.alpha,
max_threads=args.max_threads,
leanvec_dims=args.leanvec_dims,
data_matrix=data_matrix,
query_matrix=query_matrix,
)
else:
index, name, ingest_time, delete_time = build_dynamic(
Expand All @@ -135,6 +205,8 @@ def main(argv: str | None = None) -> None:
convert_vecs=args.convert_vecs,
tmp_dir=args.tmp_dir,
leanvec_dims=args.leanvec_dims,
data_matrix=data_matrix,
query_matrix=query_matrix,
)
np.save(args.out_dir / (name + ".ingest.npy"), ingest_time)
if args.num_vectors_delete > 0:
Expand Down Expand Up @@ -167,6 +239,8 @@ def build_dynamic(
convert_vecs: bool = False,
tmp_dir: Path = Path("/dev/shm"),
leanvec_dims: int | None = None,
data_matrix: npt.NDArray | None = None,
query_matrix: npt.NDArray | None = None,
) -> tuple[svs.DynamicVamana, str]:
"""Build SVS index."""
logger.info({"build_args": locals()})
Expand Down Expand Up @@ -264,6 +338,8 @@ def build_dynamic(
data_dir=tmp_idx_dir / "data",
compress=not svs_type.startswith("float"),
leanvec_dims=leanvec_dims,
data_matrix=data_matrix,
query_matrix=query_matrix,
)
index = svs.DynamicVamana(
str(tmp_idx_dir / "config"),
Expand Down Expand Up @@ -343,6 +419,8 @@ def build_static(
alpha: float | None = None,
max_threads: int = 1,
leanvec_dims: int | None = None,
data_matrix: npt.NDArray | None = None,
query_matrix: npt.NDArray | None = None,
) -> tuple[svs.Vamana, str]:
logger.info({"build_args": locals()})
logger.info(utils.read_system_config())
Expand All @@ -360,7 +438,11 @@ def build_static(
index = svs.Vamana.build(
parameters,
create_loader(
svs_type, vecs_path=vecs_path, leanvec_dims=leanvec_dims
svs_type,
vecs_path=vecs_path,
leanvec_dims=leanvec_dims,
data_matrix=data_matrix,
query_matrix=query_matrix,
),
distance,
num_threads=max_threads,
Expand Down
3 changes: 3 additions & 0 deletions src/svsbench/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import numpy as np
import svs

# Target LeanVec dimensionality. A negative value is a divisor applied to the
# input dimensionality (e.g. -4 keeps dim // 4 dimensions); see the
# `leanvec_dims < 0` handling in loader.create_loader and
# generate_leanvec_matrices.generate_leanvec_matrices.
DEFAULT_LEANVEC_DIMS: Final[int] = -4
# Cap on the number of base vectors sampled for LeanVec-OOD training
# (0 means use all vectors; see merge.read_vecs).
DEFAULT_LEANVEC_TRAIN_MAX_VECTORS: Final[int] = 100_000

DISTANCE_TO_ALPHA: Final[dict[svs.DistanceType, float]] = {
svs.DistanceType.Cosine: 0.95,
svs.DistanceType.L2: 1.2,
Expand Down
87 changes: 87 additions & 0 deletions src/svsbench/generate_leanvec_matrices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Generate LeanVec-OOD matrices."""

from pathlib import Path

import numpy as np
import numpy.typing as npt
import svs
import typer

from . import consts, merge


def main(
    vecs_file: Path,
    train_query_file: Path,
    max_vectors: int = consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS,
    leanvec_dims: int = consts.DEFAULT_LEANVEC_DIMS,
    out_dir: Path = Path(),
) -> None:
    """Generate LeanVec-OOD matrices and save them under *out_dir*.

    CLI entry point: trains the matrices from *vecs_file* (base vectors)
    and *train_query_file* (query vectors), then writes them as ``.npy``
    files and prints the resulting paths.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # The second element of the dims tuple (max_vectors echo) is unused here.
    matrices, (effective_dims, _) = generate_leanvec_matrices(
        vecs_file, train_query_file, max_vectors, leanvec_dims
    )
    data_matrix, query_matrix = matrices
    saved_paths = save_leanvec_matrices(
        vecs_file,
        train_query_file,
        max_vectors,
        effective_dims,
        data_matrix,
        query_matrix,
        out_dir,
    )
    print("Saved LeanVec matrices:", *saved_paths)


def generate_leanvec_matrices(
    vecs_file: Path,
    train_query_file: Path,
    max_vectors: int = consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS,
    leanvec_dims: int | None = None,
) -> tuple[tuple[npt.NDArray, npt.NDArray], tuple[int, int]]:
    """Generate LeanVec matrices from base vectors and query vectors.

    Returns ``((data_matrix, query_matrix), (leanvec_dims, max_vectors))``
    where *leanvec_dims* is the effective (resolved) dimensionality.
    """
    base_vectors = merge.read_vecs(vecs_file, max_vectors)
    query_vectors = merge.read_vecs(train_query_file)
    dims = (
        consts.DEFAULT_LEANVEC_DIMS if leanvec_dims is None else leanvec_dims
    )
    if dims < 0:
        # Negative values act as a divisor of the input dimensionality,
        # e.g. -4 keeps one quarter of the dimensions.
        dims = base_vectors.shape[1] // -dims
    matrices = svs.compute_leanvec_matrices(base_vectors, query_vectors, dims)
    return matrices, (dims, max_vectors)


def save_leanvec_matrices(
    vecs_file: Path,
    train_query_file: Path,
    max_vectors: int,
    leanvec_dims: int,
    data_matrix: npt.NDArray,
    query_matrix: npt.NDArray,
    out_dir: Path,
) -> tuple[Path, Path]:
    """Save LeanVec matrices to files.

    The file stem encodes the training inputs as
    ``<base>__<queries>__<dims>[__<max_vectors>]``; the trailing component
    is omitted when *max_vectors* is not positive (all vectors were used).
    Returns the paths of the saved data and query matrices.
    """
    parts = [vecs_file.name, train_query_file.name, str(leanvec_dims)]
    if max_vectors > 0:
        parts.append(str(max_vectors))
    stem = "__".join(parts)
    data_matrix_path = out_dir / f"{stem}.data.npy"
    query_matrix_path = out_dir / f"{stem}.query.npy"
    for path, matrix in (
        (data_matrix_path, data_matrix),
        (query_matrix_path, query_matrix),
    ):
        np.save(path, matrix)
    return data_matrix_path, query_matrix_path


if __name__ == "__main__":
    # Override typer's command-name derivation with an identity function,
    # presumably so parameter/command names keep their underscores verbatim;
    # see https://github.com/fastapi/typer/issues/341
    typer.main.get_command_name = lambda name: name
    typer.run(main)
7 changes: 6 additions & 1 deletion src/svsbench/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from pathlib import Path

import numpy.typing as npt
import svs

from . import consts
Expand All @@ -18,6 +19,8 @@ def create_loader(
leanvec_dims: int | None = None,
leanvec_alignment: int = 32,
lvq_strategy: svs.LVQStrategy | None = None,
data_matrix: npt.NDArray | None = None,
query_matrix: npt.NDArray | None = None,
) -> svs.VectorDataLoader | svs.LVQLoader | svs.LeanVecLoader:
"""Create loader."""
unkown_msg = f"Unknown {svs_type=}"
Expand Down Expand Up @@ -89,7 +92,7 @@ def create_loader(
raise ValueError(unkown_msg)
if vecs_path is not None or compress:
if leanvec_dims is None:
leanvec_dims = -4
leanvec_dims = consts.DEFAULT_LEANVEC_DIMS
if leanvec_dims < 0:
leanvec_dims = loader_or_str.dims // -leanvec_dims
loader = svs.LeanVecLoader(
Expand All @@ -98,6 +101,8 @@ def create_loader(
primary_kind=primary,
secondary_kind=secondary,
alignment=leanvec_alignment,
data_matrix=data_matrix,
query_matrix=query_matrix,
)
else:
loader = svs.LeanVecLoader(
Expand Down
6 changes: 4 additions & 2 deletions src/svsbench/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,14 @@ def _read_dim(fname: Path) -> int:
return dim


def read_vecs(fname: Path) -> npt.NDArray:
def read_vecs(fname: Path, max_vectors: int | None = None) -> npt.NDArray:
    """Memory-map the vectors in *fname* as a 2-D array.

    Each row is one vector; the per-row padding implied by the file suffix
    is stripped. *max_vectors* limits the number of rows returned, with
    0 or ``None`` meaning all rows.
    """
    limit = max_vectors or None  # treat 0 the same as None: no limit
    dim = _read_dim(fname)
    pad = SUFFIX_TO_PADDING[fname.suffix]
    flat = np.memmap(fname, dtype=SUFFIX_TO_DTYPE[fname.suffix], mode="r")
    rows = flat.reshape(-1, dim + pad)
    return rows[:limit, pad:]


def write(inputs: list[Path], output: Path, num_vectors: int | None) -> None:
Expand Down
Loading
Loading