forked from IntelLabs/ScalableVectorSearchBenchmarking
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge.py
More file actions
78 lines (61 loc) · 2.08 KB
/
merge.py
File metadata and controls
78 lines (61 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Merge multiple *vecs files into one."""
import argparse
import struct
from pathlib import Path
from typing import Final
import numpy as np
import numpy.typing as npt
from tqdm import tqdm
# NumPy element type used to interpret the contents of a *vecs file, keyed by
# the file's extension (including the leading dot).
SUFFIX_TO_DTYPE: Final = {
    ".fvecs": np.float32,
    ".hvecs": np.float16,
    ".bvecs": np.uint8,
    ".ivecs": np.uint32,
}
# Width of the per-vector dimension header, expressed in elements of the
# matching dtype in SUFFIX_TO_DTYPE. The header is 4 bytes long, so it spans
# 1 four-byte element, 2 two-byte elements or 4 one-byte elements. read_vecs
# strips this many leading columns from every row.
SUFFIX_TO_PADDING: Final = {
    ".fvecs": 1,
    ".hvecs": 2,
    ".bvecs": 4,
    ".ivecs": 1,
}
def _read_dim(fname: Path) -> int:
    """Read the vector dimension stored at the start of a *vecs file.

    The first four bytes of the file are decoded as a little-endian signed
    32-bit integer. The byte order is given explicitly so the result does not
    depend on the host and agrees with the little-endian header that ``write``
    emits.

    Args:
        fname: Path of the *vecs file to inspect.

    Returns:
        The dimension recorded in the file's first header.

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file holds fewer than four bytes.
    """
    with open(fname, "rb") as file:
        # "<i" = little-endian, standard 4-byte size, no alignment padding.
        (dim,) = struct.unpack("<i", file.read(4))
    return dim
def read_vecs(fname: Path, max_vectors: int | None = None) -> npt.NDArray:
    """Memory-map a *vecs file and return a view of its vector data.

    The file is mapped read-only with the dtype registered for its suffix and
    viewed as rows of ``dim + padding`` elements; the leading ``padding``
    elements of every row (the dimension header) are sliced away.

    Args:
        fname: Path of the *vecs file; its suffix selects dtype and padding.
        max_vectors: Upper bound on the number of rows returned. ``None`` and
            ``0`` both mean "no limit".

    Returns:
        A 2-D view with ``dim`` columns backed by the memory map.
    """
    dim = _read_dim(fname)
    suffix = fname.suffix
    header_width = SUFFIX_TO_PADDING[suffix]
    flat = np.memmap(fname, dtype=SUFFIX_TO_DTYPE[suffix], mode="r")
    rows = flat.reshape(-1, dim + header_width)
    # A limit of 0 is treated as "unlimited", the same as None.
    limit = None if max_vectors == 0 else max_vectors
    return rows[:limit, header_width:]
def write(inputs: list[Path], output: Path, num_vectors: int | None) -> None:
    """Concatenate the vectors of several *vecs files into ``output``.

    Each vector is written as a 4-byte little-endian dimension header followed
    by the vector's raw bytes. Inputs are processed in the order given.

    Args:
        inputs: Paths of the *vecs files to merge.
        output: Path of the merged file; it is created or overwritten.
        num_vectors: If not ``None``, only the leading ``num_vectors`` rows of
            *each* input are copied (plain slice semantics).

    Raises:
        ValueError: If an input's dimension differs from the first input's.
    """
    dim = _read_dim(inputs[0])
    # Every record is prefixed with the first file's dimension, so an input
    # with another dimension would yield a corrupt file. Check all headers
    # before the output is opened (and therefore truncated).
    for fname in inputs[1:]:
        other_dim = _read_dim(fname)
        if other_dim != dim:
            raise ValueError(
                f"Dimension mismatch: {fname} has dimension {other_dim}, "
                f"but {inputs[0]} has dimension {dim}"
            )
    dim_bytes = dim.to_bytes(4, "little")
    with open(output, "wb") as file:
        for fname in tqdm(inputs):
            array = read_vecs(fname)
            if num_vectors is not None:
                array = array[:num_vectors]
            for vector in tqdm(array):
                file.write(dim_bytes)
                file.write(vector.tobytes())
def _read_args(argv: list[str] | None = None) -> argparse.Namespace:
"""Read command line arguments."""
parser = argparse.ArgumentParser(description=__file__.__doc__)
parser.add_argument(
"i", help="Input file names", action="extend", type=Path, nargs="+"
)
parser.add_argument("-o", help="Output file name", type=Path)
parser.add_argument("--num_vectors", type=int)
return parser.parse_args(argv)
def main():
    """Parse the command line and merge the requested files."""
    cli = _read_args()
    write(inputs=cli.i, output=cli.o, num_vectors=cli.num_vectors)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()