Skip to content

Commit e23d182

Browse files
authored
Add support for LeanVec-OOD (#16)
1 parent 5386b39 commit e23d182

File tree

10 files changed

+312
-9
lines changed

10 files changed

+312
-9
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,17 @@ python -m svsbench.build \
2525
--proportion_vectors_init 0.5 --batch_size 10000
2626
```
2727

28+
#### Building a LeanVec-OOD index
29+
30+
[LeanVec-OOD](https://openreview.net/forum?id=wczqrpOrIc) is a dimensionality reduction method for cases where query and base vectors have different distributions (e.g., text-to-image search). Use `--train_query_file` to provide training queries.
31+
32+
```sh
33+
python -m svsbench.build \
34+
--vecs_file /path/to/vectors.fvecs \
35+
--svs_type leanvec4x8 \
36+
--train_query_file /path/to/train_query_vectors.fvecs
37+
```
38+
2839
### Computing the ground truth
2940

3041
For the query vectors used in performance measurements:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies = [
1111
"numpy>=1.10",
1212
"scalable-vs>=0.0.7",
1313
"tqdm>=4.67",
14+
"typer-slim>=0.15.2",
1415
]
1516

1617
[build-system]

src/svsbench/build.py

Lines changed: 88 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,22 @@
44

55
import argparse
66
import logging
7-
import os
87
import sys
98
import tempfile
109
import time
1110
from pathlib import Path
1211

1312
import numpy as np
13+
import numpy.typing as npt
1414
import svs
1515
from tqdm import tqdm
1616

17-
from . import consts
17+
from . import consts, utils
18+
from .generate_leanvec_matrices import (
19+
generate_leanvec_matrices,
20+
save_leanvec_matrices,
21+
)
1822
from .loader import create_loader
19-
from . import utils
2023

2124
logger = logging.getLogger(__file__)
2225

@@ -86,18 +89,83 @@ def _read_args(argv: list[str] | None = None) -> argparse.Namespace:
8689
parser.add_argument(
8790
"--leanvec_dims", help="LeanVec dimensionality", type=int
8891
)
89-
parser.add_argument("--no_save", action="store_true")
92+
parser.add_argument(
93+
"--no_save", action="store_true", help="Do not save built index"
94+
)
95+
parser.add_argument(
96+
"--train_query_file",
97+
help="Query *vecs file for LeanVec out-of-distribution training",
98+
type=Path,
99+
)
100+
parser.add_argument(
101+
"--train_max_vectors",
102+
help="Maximum number of base vectors from vecs_file"
103+
" to use for LeanVec out-of-distribution training (0 for all)",
104+
type=int,
105+
default=consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS,
106+
)
107+
parser.add_argument(
108+
"--no_save_matrices",
109+
action="store_true",
110+
help="Do not save LeanVec matrices",
111+
)
112+
parser.add_argument(
113+
"--data_matrix_file",
114+
help="Data matrix npy file for LeanVec",
115+
type=Path,
116+
)
117+
parser.add_argument(
118+
"--query_matrix_file",
119+
help="Query matrix npy file for LeanVec",
120+
type=Path,
121+
)
90122
return parser.parse_args(argv)
91123

92124

93-
def main(argv: str | None = None) -> None:
125+
def main(argv: list[str] | None = None) -> None:
94126
args = _read_args(argv)
95127
log_file = utils.configure_logger(
96128
logger, args.log_dir if args.log_dir is not None else args.out_dir
97129
)
98130
print("Logging to", log_file, sep="\n")
99131
logger.info({"argv": argv if argv else sys.argv})
100132
args.out_dir.mkdir(exist_ok=True)
133+
if args.data_matrix_file is not None:
134+
if args.query_matrix_file is None:
135+
raise ValueError(
136+
"query_matrix_file must be provided with data_matrix_file"
137+
)
138+
data_matrix = np.load(args.data_matrix_file)
139+
query_matrix = np.load(args.query_matrix_file)
140+
elif args.train_query_file is not None:
141+
(data_matrix, query_matrix), (leanvec_dims_effective, _) = (
142+
generate_leanvec_matrices(
143+
args.vecs_file,
144+
args.train_query_file,
145+
args.train_max_vectors,
146+
args.leanvec_dims,
147+
)
148+
)
149+
if not args.no_save_matrices:
150+
data_matrix_path, query_matrix_path = save_leanvec_matrices(
151+
args.vecs_file,
152+
args.train_query_file,
153+
args.train_max_vectors,
154+
leanvec_dims_effective,
155+
data_matrix,
156+
query_matrix,
157+
args.out_dir,
158+
)
159+
logger.info(
160+
{
161+
"saved_leanvec_matrices": (
162+
data_matrix_path,
163+
query_matrix_path,
164+
)
165+
}
166+
)
167+
else:
168+
data_matrix = query_matrix = None
101169
if args.static:
102170
index, name = build_static(
103171
vecs_path=args.vecs_file,
@@ -110,6 +178,8 @@ def main(argv: str | None = None) -> None:
110178
alpha=args.alpha,
111179
max_threads=args.max_threads,
112180
leanvec_dims=args.leanvec_dims,
181+
data_matrix=data_matrix,
182+
query_matrix=query_matrix,
113183
)
114184
else:
115185
index, name, ingest_time, delete_time = build_dynamic(
@@ -135,6 +205,8 @@ def main(argv: str | None = None) -> None:
135205
convert_vecs=args.convert_vecs,
136206
tmp_dir=args.tmp_dir,
137207
leanvec_dims=args.leanvec_dims,
208+
data_matrix=data_matrix,
209+
query_matrix=query_matrix,
138210
)
139211
np.save(args.out_dir / (name + ".ingest.npy"), ingest_time)
140212
if args.num_vectors_delete > 0:
@@ -167,6 +239,8 @@ def build_dynamic(
167239
convert_vecs: bool = False,
168240
tmp_dir: Path = Path("/dev/shm"),
169241
leanvec_dims: int | None = None,
242+
data_matrix: npt.NDArray | None = None,
243+
query_matrix: npt.NDArray | None = None,
170244
) -> tuple[svs.DynamicVamana, str]:
171245
"""Build SVS index."""
172246
logger.info({"build_args": locals()})
@@ -264,6 +338,8 @@ def build_dynamic(
264338
data_dir=tmp_idx_dir / "data",
265339
compress=not svs_type.startswith("float"),
266340
leanvec_dims=leanvec_dims,
341+
data_matrix=data_matrix,
342+
query_matrix=query_matrix,
267343
)
268344
index = svs.DynamicVamana(
269345
str(tmp_idx_dir / "config"),
@@ -343,6 +419,8 @@ def build_static(
343419
alpha: float | None = None,
344420
max_threads: int = 1,
345421
leanvec_dims: int | None = None,
422+
data_matrix: npt.NDArray | None = None,
423+
query_matrix: npt.NDArray | None = None,
346424
) -> tuple[svs.Vamana, str]:
347425
logger.info({"build_args": locals()})
348426
logger.info(utils.read_system_config())
@@ -360,7 +438,11 @@ def build_static(
360438
index = svs.Vamana.build(
361439
parameters,
362440
create_loader(
363-
svs_type, vecs_path=vecs_path, leanvec_dims=leanvec_dims
441+
svs_type,
442+
vecs_path=vecs_path,
443+
leanvec_dims=leanvec_dims,
444+
data_matrix=data_matrix,
445+
query_matrix=query_matrix,
364446
),
365447
distance,
366448
num_threads=max_threads,

src/svsbench/consts.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
import numpy as np
88
import svs
99

10+
DEFAULT_LEANVEC_DIMS: Final[int] = -4
11+
DEFAULT_LEANVEC_TRAIN_MAX_VECTORS: Final[int] = 100_000
12+
1013
DISTANCE_TO_ALPHA: Final[dict[svs.DistanceType, float]] = {
1114
svs.DistanceType.Cosine: 0.95,
1215
svs.DistanceType.L2: 1.2,
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
"""Generate LeanVec-OOD matrices."""
4+
5+
from pathlib import Path
6+
7+
import numpy as np
8+
import numpy.typing as npt
9+
import svs
10+
import typer
11+
12+
from . import consts, merge
13+
14+
15+
def main(
    vecs_file: Path,
    train_query_file: Path,
    max_vectors: int = consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS,
    leanvec_dims: int = consts.DEFAULT_LEANVEC_DIMS,
    out_dir: Path = Path(),
) -> None:
    """Generate LeanVec-OOD matrices and save them as npy files.

    Trains on the base vectors in VECS_FILE (at most MAX_VECTORS of them,
    0 for all) and on the query vectors in TRAIN_QUERY_FILE, then writes
    the data and query matrices into OUT_DIR, creating it if needed.
    A negative LEANVEC_DIMS is a divisor of the vector dimensionality
    (e.g., -4 keeps a quarter of the dimensions).
    """
    # Typer shows the docstring above as the command's --help text.
    out_dir.mkdir(parents=True, exist_ok=True)
    # The second tuple echoes back (effective leanvec_dims, max_vectors);
    # only the resolved dimensionality is needed for the file names.
    (data_matrix, query_matrix), (leanvec_dims_effective, _) = (
        generate_leanvec_matrices(
            vecs_file, train_query_file, max_vectors, leanvec_dims
        )
    )
    data_matrix_path, query_matrix_path = save_leanvec_matrices(
        vecs_file,
        train_query_file,
        max_vectors,
        leanvec_dims_effective,
        data_matrix,
        query_matrix,
        out_dir,
    )
    print("Saved LeanVec matrices:", data_matrix_path, query_matrix_path)
38+
39+
40+
def generate_leanvec_matrices(
    vecs_file: Path,
    train_query_file: Path,
    max_vectors: int = consts.DEFAULT_LEANVEC_TRAIN_MAX_VECTORS,
    leanvec_dims: int | None = None,
) -> tuple[tuple[npt.NDArray, npt.NDArray], tuple[int, int]]:
    """Compute LeanVec matrices from base and training query vectors.

    At most ``max_vectors`` base vectors are read from ``vecs_file``;
    every vector in ``train_query_file`` is used. ``leanvec_dims`` falls
    back to ``consts.DEFAULT_LEANVEC_DIMS`` when ``None``, and a negative
    value is interpreted as a divisor of the base vector dimensionality.

    Returns the result of ``svs.compute_leanvec_matrices`` (annotated as
    the data and query matrices) paired with
    ``(effective leanvec_dims, max_vectors)``.
    """
    train_base = merge.read_vecs(vecs_file, max_vectors)
    train_queries = merge.read_vecs(train_query_file)
    input_dims = train_base.shape[1]
    target_dims = (
        consts.DEFAULT_LEANVEC_DIMS if leanvec_dims is None else leanvec_dims
    )
    if target_dims < 0:
        # Negative values select a fraction of the input dimensionality.
        target_dims = input_dims // -target_dims
    matrices = svs.compute_leanvec_matrices(
        train_base, train_queries, target_dims
    )
    return matrices, (target_dims, max_vectors)
57+
58+
59+
def save_leanvec_matrices(
60+
vecs_file: Path,
61+
train_query_file: Path,
62+
max_vectors: int,
63+
leanvec_dims: int,
64+
data_matrix: npt.NDArray,
65+
query_matrix: npt.NDArray,
66+
out_dir: Path,
67+
) -> tuple[Path, Path]:
68+
"""Save LeanVec matrices to files."""
69+
name_components = [
70+
vecs_file.name,
71+
train_query_file.name,
72+
str(leanvec_dims),
73+
]
74+
if max_vectors > 0:
75+
name_components.append(str(max_vectors))
76+
base_name = "__".join(name_components)
77+
data_matrix_path = out_dir / (base_name + ".data.npy")
78+
query_matrix_path = out_dir / (base_name + ".query.npy")
79+
np.save(data_matrix_path, data_matrix)
80+
np.save(query_matrix_path, query_matrix)
81+
return data_matrix_path, query_matrix_path
82+
83+
84+
if __name__ == "__main__":

    def _verbatim_name(name):
        """Return the given name unchanged."""
        return name

    # Workaround from https://github.com/fastapi/typer/issues/341:
    # replace Typer's command-name hook with an identity function so
    # names are used exactly as written.
    typer.main.get_command_name = _verbatim_name
    typer.run(main)

src/svsbench/loader.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from pathlib import Path
66

7+
import numpy.typing as npt
78
import svs
89

910
from . import consts
@@ -18,6 +19,8 @@ def create_loader(
1819
leanvec_dims: int | None = None,
1920
leanvec_alignment: int = 32,
2021
lvq_strategy: svs.LVQStrategy | None = None,
22+
data_matrix: npt.NDArray | None = None,
23+
query_matrix: npt.NDArray | None = None,
2124
) -> svs.VectorDataLoader | svs.LVQLoader | svs.LeanVecLoader:
2225
"""Create loader."""
2326
unkown_msg = f"Unknown {svs_type=}"
@@ -89,7 +92,7 @@ def create_loader(
8992
raise ValueError(unkown_msg)
9093
if vecs_path is not None or compress:
9194
if leanvec_dims is None:
92-
leanvec_dims = -4
95+
leanvec_dims = consts.DEFAULT_LEANVEC_DIMS
9396
if leanvec_dims < 0:
9497
leanvec_dims = loader_or_str.dims // -leanvec_dims
9598
loader = svs.LeanVecLoader(
@@ -98,6 +101,8 @@ def create_loader(
98101
primary_kind=primary,
99102
secondary_kind=secondary,
100103
alignment=leanvec_alignment,
104+
data_matrix=data_matrix,
105+
query_matrix=query_matrix,
101106
)
102107
else:
103108
loader = svs.LeanVecLoader(

src/svsbench/merge.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@ def _read_dim(fname: Path) -> int:
3434
return dim
3535

3636

37-
def read_vecs(fname: Path) -> npt.NDArray:
37+
def read_vecs(fname: Path, max_vectors: int | None = None) -> npt.NDArray:
3838
"""Create NumPy memory maps."""
39+
if max_vectors == 0:
40+
max_vectors = None
3941
dim = _read_dim(fname)
4042
padding = SUFFIX_TO_PADDING[fname.suffix]
4143
array = np.memmap(fname, dtype=SUFFIX_TO_DTYPE[fname.suffix], mode="r")
42-
return array.reshape(-1, dim + padding)[:, padding:]
44+
return array.reshape(-1, dim + padding)[:max_vectors, padding:]
4345

4446

4547
def write(inputs: list[Path], output: Path, num_vectors: int | None) -> None:

0 commit comments

Comments
 (0)