This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit 337d9b9

Merge remote-tracking branch 'origin/upstreamchanges2502'

2 parents: a23d4a8 + 3044c68

21 files changed: +1089 -183 lines

.github/workflows/benchmarks.yml

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,7 @@ jobs:
           - onng_ngt
           - opensearchknn
           - panng_ngt
+          - parlayann
           - pg_embedding
           - pgvector
           - pgvecto_rs
@@ -70,6 +71,7 @@ jobs:
           - vearch
           - vespa
           - voyager
+          - vsag
           - weaviate
         include:
           - library: pynndescent

README.md

Lines changed: 2 additions & 0 deletions
@@ -71,6 +71,8 @@ We have a number of precomputed data sets in HDF5 format. All data sets have bee
 | [NYTimes](https://archive.ics.uci.edu/ml/datasets/bag+of+words) | 256 | 290,000 | 10,000 | 100 | Angular | [HDF5](http://ann-benchmarks.com/nytimes-256-angular.hdf5) (301MB) |
 | [SIFT](http://corpus-texmex.irisa.fr/) | 128 | 1,000,000 | 10,000 | 100 | Euclidean | [HDF5](http://ann-benchmarks.com/sift-128-euclidean.hdf5) (501MB) |
 | [Last.fm](https://github.com/erikbern/ann-benchmarks/pull/91) | 65 | 292,385 | 50,000 | 100 | Angular | [HDF5](http://ann-benchmarks.com/lastfm-64-dot.hdf5) (135MB) |
+| [COCO-I2I](https://cocodataset.org/) | 512 | 113,287 | 10,000 | 100 | Angular | [HDF5](https://github.com/fabiocarrara/str-encoders/releases/download/v0.1.3/coco-i2i-512-angular.hdf5) (136MB) |
+| [COCO-T2I](https://cocodataset.org/) | 512 | 113,287 | 10,000 | 100 | Angular | [HDF5](https://github.com/fabiocarrara/str-encoders/releases/download/v0.1.3/coco-t2i-512-angular.hdf5) (136MB) |
 
 Results
 =======

ann_benchmarks/algorithms/elastiknn/Dockerfile

Lines changed: 3 additions & 6 deletions
@@ -13,13 +13,13 @@ WORKDIR /home/elasticsearch
 USER elasticsearch
 
 # Install elasticsearch.
-RUN curl -o elasticsearch.tar.gz https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.12.2-linux-x86_64.tar.gz
+RUN curl -o elasticsearch.tar.gz https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.15.0-linux-x86_64.tar.gz
 RUN tar xzf elasticsearch.tar.gz
 RUN mv elasticsearch-* elasticsearch && rm elasticsearch.tar.gz
 
 # Install plugin.
 RUN /home/elasticsearch/elasticsearch/bin/elasticsearch-plugin install --batch \
-    https://github.com/alexklibisz/elastiknn/releases/download/8.12.2.1/elastiknn-8.12.2.1.zip
+    https://github.com/alexklibisz/elastiknn/releases/download/8.15.0.1/elastiknn-8.15.0.1.zip
 
 # Configuration
 # Backup the original configurations, which can be useful for comparing.
@@ -66,10 +66,7 @@ USER root
 WORKDIR /home/app
 
 # Install python client.
-# Using no-deps because scipy (1.7.0) is incompatible with the container version of Python (3.6).
-# Then we need to install the deps manually.
-RUN python3 -m pip install --no-deps elastiknn-client==8.6.2.0
-RUN python3 -m pip install elasticsearch==8.6.2 dataclasses-json==0.3.7 tqdm==4.61.1
+RUN python3 -m pip install elastiknn-client==8.15.0.1
 
 # Custom entrypoint that also starts the Elasticsearch server in the background
 RUN echo '\

ann_benchmarks/algorithms/elastiknn/config.yml

Lines changed: 2 additions & 2 deletions
@@ -18,5 +18,5 @@ float:
     name: elastiknn-l2lsh
     run_groups:
       elastiknn-l2lsh:
-        args: [[100], [4], [1024, 2048]]
-        query_args: [[500, 1000], [0, 3]]
+        args: [[175], [7], [3900]]
+        query_args: [[100,500,1000], [0]]

ann_benchmarks/algorithms/hnswlib/module.py

Lines changed: 1 addition & 2 deletions
@@ -9,8 +9,6 @@ def __init__(self, metric, method_param):
         self.metric = {"angular": "cosine", "euclidean": "l2"}[metric]
         self.method_param = method_param
         # print(self.method_param,save_index,query_param)
-        # self.ef=query_param['ef']
-        self.name = "hnswlib (%s)" % (self.method_param)
 
     def fit(self, X):
         # Only l2 is supported currently
@@ -24,6 +22,7 @@ def fit(self, X):
 
     def set_query_arguments(self, ef):
         self.p.set_ef(ef)
+        self.name = "hnswlib (%s, 'efQuery': %s)" % (self.method_param, ef)
 
     def query(self, v, n):
         # print(np.expand_dims(v,axis=0).shape)
ann_benchmarks/algorithms/parlayann/Dockerfile (new file)

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+FROM ann-benchmarks
+
+RUN apt update
+RUN apt install -y software-properties-common
+RUN add-apt-repository -y ppa:git-core/ppa
+RUN apt update
+RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10
+
+#RUN apt-get update
+#RUN apt-get install -y g++ software-properties-common
+RUN pip3 install pybind11 numpy
+
+ARG CACHEBUST=1
+RUN git clone -b annbench https://github.com/cmuparlay/ParlayANN.git
+RUN cd ParlayANN && git submodule update --init --recursive
+RUN cd ParlayANN/python && bash compile.sh
+#RUN cd ParlayANN/python && pip install -e .
+#RUN python3 -c 'import parlaypy'
+ENV PYTHONPATH=$PYTHONPATH:/home/app/ParlayANN/python
+WORKDIR /home/app
ann_benchmarks/algorithms/parlayann/config.yml (new file)

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+float:
+  euclidean:
+  - base_args: ['@metric']
+    constructor: ParlayANN
+    disabled: false
+    docker_tag: ann-benchmarks-parlayann
+    module: ann_benchmarks.algorithms.parlayann
+    name: parlayann
+    run_groups:
+      parlay_80:
+        args: [{alpha: 1.15, R: 80, L: 160, two_pass: True}]
+        query_args: [[{Q: 20}, {Q: 22}, {Q: 25}, {Q: 30}, {Q: 40}, {Q: 50}, {Q: 60}, {Q: 80}, {Q: 100}, {Q: 125}, {Q: 150}, {Q: 200}, {Q: 300}, {Q: 400}, {Q: 600}, {Q: 800}]]
+      parlay_64:
+        args: [{alpha: 1.1, R: 64, L: 128, two_pass: True}]
+        query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}, {Q: 22}, {Q: 25}, {Q: 30}, {Q: 40}, {Q: 50}, {Q: 60}, {Q: 80}, {Q: 100}, {Q: 125}, {Q: 150}, {Q: 200}, {Q: 300}, {Q: 400}, {Q: 600}, {Q: 800}]]
+      parlay_40:
+        args: [{alpha: 1.08, R: 40, L: 80, two_pass: True}]
+        query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}]]
+      parlay_32_05:
+        args: [{alpha: 1.05, R: 32, L: 64, two_pass: True}]
+        query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}]]
+  angular:
+  - base_args: ['@metric']
+    constructor: ParlayANN
+    disabled: false
+    docker_tag: ann-benchmarks-parlayann
+    module: ann_benchmarks.algorithms.parlayann
+    name: parlayann
+    run_groups:
+      parlay_130:
+        args: [{alpha: .85, R: 130, L: 260, two_pass: True}]
+        query_args: [[{Q: 20}, {Q: 22}, {Q: 25}, {Q: 30}, {Q: 40}, {Q: 50}, {Q: 60}, {Q: 80}, {Q: 100}, {Q: 125}, {Q: 150}, {Q: 200}, {Q: 400}, {Q: 800}]]
+      parlay_100:
+        args: [{alpha: .85, R: 100, L: 200, two_pass: True}]
+        query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {limit: 25}, {limit: 30}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 13}, {Q: 14}, {Q: 15}, {Q: 16}, {Q: 17}, {Q: 18}, {Q: 20}, {Q: 22}, {Q: 25}, {Q: 30}, {Q: 40}, {Q: 50}, {Q: 60}, {Q: 80}, {Q: 100}, {Q: 125}, {Q: 150}, {Q: 200}, {Q: 400}, {Q: 800}]]
+      parlay_80:
+        args: [{alpha: .90, R: 80, L: 160, two_pass: True}]
+        query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {limit: 25}, {limit: 30}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 13}, {Q: 14}, {Q: 15}, {Q: 16}, {Q: 17}, {Q: 18}, {Q: 20}]]
+      parlay_50:
+        args: [{alpha: .95, R: 50, L: 100, two_pass: True}]
+        query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {limit: 25}, {limit: 30}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}]]
+
+
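For orientation: each run group above builds one index from its args dict and then sweeps the listed query_args dicts, one run per dict; as the module.py that follows shows, missing keys fall back to Q=10 and limit=1000. A minimal sketch of that defaulting, not part of the commit, with example values drawn from the run groups above:

    # Hypothetical walk-through of how one run group's query_args are resolved.
    index_args = {"alpha": 1.15, "R": 80, "L": 160, "two_pass": True}  # parlay_80 build args

    for query_args in [{"Q": 20}, {"Q": 40}, {"limit": 12}]:
        # Mirrors the defaulting in ParlayANN.set_query_arguments() below.
        limit = 1000 if query_args.get("limit") is None else query_args.get("limit")
        Q = 10 if query_args.get("Q") is None else query_args.get("Q")
        print("Q =", Q, "limit =", limit)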
ann_benchmarks/algorithms/parlayann/module.py (new file)

Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+from __future__ import absolute_import
+import psutil
+import os
+import struct
+import time
+import numpy as np
+import wrapper as pann
+
+from ..base.module import BaseANN
+
+class ParlayANN(BaseANN):
+    def __init__(self, metric, index_params):
+        self.name = "parlayann_(" + str(index_params) + ")"
+        self._index_params = index_params
+        self._metric = self.translate_dist_fn(metric)
+
+        self.R = int(index_params.get("R", 50))
+        self.L = int(index_params.get("L", 100))
+        self.alpha = float(index_params.get("alpha", 1.15))
+        self.two_pass = bool(index_params.get("two_pass", False))
+
+    def translate_dist_fn(self, metric):
+        if metric == 'euclidean':
+            return 'Euclidian'
+        elif metric == 'ip':
+            return 'mips'
+        elif metric == 'angular':
+            return 'mips'
+        else:
+            raise Exception('Invalid metric')
+
+    def translate_dtype(self, dtype:str):
+        if dtype == 'float32':
+            return 'float'
+        else:
+            return dtype
+
+    def fit(self, X):
+        def bin_to_float(binary):
+            return struct.unpack("!f", struct.pack("!I", int(binary, 2)))[0]
+
+        print("Vamana: Starting Fit...")
+        index_dir = "indices"
+
+        if not os.path.exists(index_dir):
+            os.makedirs(index_dir)
+
+        data_path = os.path.join(index_dir, "base.bin")
+        save_path = os.path.join(index_dir, self.name)
+        print("parlayann: Index Stored At: " + save_path)
+        nb, dims = X.shape
+        shape = [
+            np.float32(bin_to_float("{:032b}".format(nb))),
+            np.float32(bin_to_float("{:032b}".format(dims))),
+        ]
+        X = X.flatten()
+        X = np.insert(X, 0, shape)
+        X.tofile(data_path)
+
+        if not os.path.exists(save_path):
+            print("parlayann: Creating Index")
+            start = time.time()
+            self.params = pann.build_vamana_index(self._metric, "float", data_path, save_path,
+                                                  self.R, self.L, self.alpha, self.two_pass)
+            end = time.time()
+            print("Indexing time: ", end - start)
+            print(f"Wrote index to {save_path}")
+        self.index = pann.load_index(self._metric, "float", data_path, save_path)
+        print("Index loaded")
+
+    def query(self, X, k):
+        return self.index.single_search(X, k, self.Q, True, self.limit)
+
+    def batch_query(self, X, k):
+        print("running batch")
+        nq, dims = X.shape
+        self.res, self.distances = self.index.batch_search(X, k, self.Q, True, self.limit)
+        return self.res
+
+    def set_query_arguments(self, query_args):
+        self.name = "parlayann_(" + str(self._index_params) + "," + str(query_args) + ")"
+        print(query_args)
+        self.limit = 1000 if query_args.get("limit") is None else query_args.get("limit")
+        self.Q = 10 if query_args.get("Q") is None else query_args.get("Q")
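A note on the fit() method above: the base.bin file it writes begins with two float32 values whose bit patterns are the raw 32-bit integers for the point count and dimensionality, followed by the flattened float32 data, so the whole file stays one homogeneous array. A minimal sketch, not part of the commit, showing that this header convention round-trips (sizes borrowed from the COCO rows in the README above):

    import struct

    import numpy as np

    def bin_to_float(binary):
        # Same helper as in fit(): reinterpret a 32-bit integer bit pattern as float32.
        return struct.unpack("!f", struct.pack("!I", int(binary, 2)))[0]

    nb, dims = 113287, 512
    header = np.array(
        [bin_to_float("{:032b}".format(nb)), bin_to_float("{:032b}".format(dims))],
        dtype=np.float32,
    )
    # Viewing the same bytes as uint32 recovers the original integers.
    assert header.view(np.uint32).tolist() == [nb, dims]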

ann_benchmarks/algorithms/pgvector/module.py

Lines changed: 97 additions & 2 deletions
@@ -1,10 +1,52 @@
+"""
+This module supports connecting to a PostgreSQL instance and performing vector
+indexing and search using the pgvector extension. The default behavior uses
+the "ann" value of PostgreSQL user name, password, and database name, as well
+as the default host and port values of the psycopg driver.
+
+If PostgreSQL is managed externally, e.g. in a cloud DBaaS environment, the
+environment variable overrides listed below are available for setting PostgreSQL
+connection parameters:
+
+ANN_BENCHMARKS_PG_USER
+ANN_BENCHMARKS_PG_PASSWORD
+ANN_BENCHMARKS_PG_DBNAME
+ANN_BENCHMARKS_PG_HOST
+ANN_BENCHMARKS_PG_PORT
+
+This module starts the PostgreSQL service automatically using the "service"
+command. The environment variable ANN_BENCHMARKS_PG_START_SERVICE could be set
+to "false" (or e.g. "0" or "no") in order to disable this behavior.
+
+This module will also attempt to create the pgvector extension inside the
+target database, if it has not been already created.
+"""
+
 import subprocess
 import sys
+import os
 
 import pgvector.psycopg
 import psycopg
 
+from typing import Dict, Any, Optional
+
 from ..base.module import BaseANN
+from ...util import get_bool_env_var
+
+
+def get_pg_param_env_var_name(pg_param_name: str) -> str:
+    return f'ANN_BENCHMARKS_PG_{pg_param_name.upper()}'
+
+
+def get_pg_conn_param(
+        pg_param_name: str,
+        default_value: Optional[str] = None) -> Optional[str]:
+    env_var_name = get_pg_param_env_var_name(pg_param_name)
+    env_var_value = os.getenv(env_var_name, default_value)
+    if env_var_value is None or len(env_var_value.strip()) == 0:
+        return default_value
+    return env_var_value
 
 
 class PGVector(BaseANN):
@@ -21,9 +63,61 @@ def __init__(self, metric, method_param):
         else:
             raise RuntimeError(f"unknown metric {metric}")
 
+    def ensure_pgvector_extension_created(self, conn: psycopg.Connection) -> None:
+        """
+        Ensure that `CREATE EXTENSION vector` has been executed.
+        """
+        with conn.cursor() as cur:
+            # We have to use a separate cursor for this operation.
+            # If we reuse the same cursor for later operations, we might get
+            # the following error:
+            # KeyError: "couldn't find the type 'vector' in the types registry"
+            cur.execute(
+                "SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector')")
+            pgvector_exists = cur.fetchone()[0]
+            if pgvector_exists:
+                print("vector extension already exists")
+            else:
+                print("vector extension does not exist, creating")
+                cur.execute("CREATE EXTENSION vector")
+
     def fit(self, X):
-        subprocess.run("service postgresql start", shell=True, check=True, stdout=sys.stdout, stderr=sys.stderr)
-        conn = psycopg.connect(user="ann", password="ann", dbname="ann", autocommit=True)
+        psycopg_connect_kwargs: Dict[str, Any] = dict(
+            autocommit=True,
+        )
+        for arg_name in ['user', 'password', 'dbname']:
+            # The default value is "ann" for all of these parameters.
+            psycopg_connect_kwargs[arg_name] = get_pg_conn_param(
+                arg_name, 'ann')
+
+        # If host/port are not specified, leave the default choice to the
+        # psycopg driver.
+        pg_host: Optional[str] = get_pg_conn_param('host')
+        if pg_host is not None:
+            psycopg_connect_kwargs['host'] = pg_host
+
+        pg_port_str: Optional[str] = get_pg_conn_param('port')
+        if pg_port_str is not None:
+            psycopg_connect_kwargs['port'] = int(pg_port_str)
+
+        should_start_service = get_bool_env_var(
+            get_pg_param_env_var_name('start_service'),
+            default_value=True)
+        if should_start_service:
+            subprocess.run(
+                "service postgresql start",
+                shell=True,
+                check=True,
+                stdout=sys.stdout,
+                stderr=sys.stderr)
+        else:
+            print(
+                "Assuming that PostgreSQL service is managed externally. "
+                "Not attempting to start the service.")
+
+        conn = psycopg.connect(**psycopg_connect_kwargs)
+        self.ensure_pgvector_extension_created(conn)
+
         pgvector.psycopg.register_vector(conn)
         cur = conn.cursor()
         cur.execute("DROP TABLE IF EXISTS items")
@@ -46,6 +140,7 @@ def fit(self, X):
         print("done!")
         self._cur = cur
 
+
     def set_query_arguments(self, ef_search):
         self._ef_search = ef_search
         self._cur.execute("SET hnsw.ef_search = %d" % ef_search)
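As a usage note for the environment-variable overrides documented in the module docstring above: a benchmark run can be pointed at an externally managed PostgreSQL instance purely through the environment. A minimal sketch, not part of the commit; the host and password values are placeholders, and only the variable names and the "ann" defaults come from the docstring:

    import os

    os.environ["ANN_BENCHMARKS_PG_HOST"] = "pg.example.internal"   # placeholder host
    os.environ["ANN_BENCHMARKS_PG_PORT"] = "5432"
    os.environ["ANN_BENCHMARKS_PG_USER"] = "ann"
    os.environ["ANN_BENCHMARKS_PG_PASSWORD"] = "ann"
    os.environ["ANN_BENCHMARKS_PG_DBNAME"] = "ann"
    # Skip the in-container "service postgresql start" step.
    os.environ["ANN_BENCHMARKS_PG_START_SERVICE"] = "false"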

ann_benchmarks/algorithms/qsg_ngt/Dockerfile

Lines changed: 3 additions & 2 deletions
@@ -4,8 +4,9 @@ RUN apt update
 RUN apt install -y git cmake g++ python3 python3-setuptools python3-pip libblas-dev liblapack-dev
 RUN pip3 install wheel pybind11==2.5.0
 RUN git clone https://github.com/WPJiang/HWTL_SDU-ANNS.git
-RUN cp HWTL_SDU-ANNS/lib/* /usr/local/lib/
-RUN cp HWTL_SDU-ANNS/bin/* /usr/local/bin/
+RUN cp HWTL_SDU-ANNS/create.py /home/app/
+RUN cp -r HWTL_SDU-ANNS/lib/* /usr/local/lib/
+RUN cp -r HWTL_SDU-ANNS/bin/* /usr/local/bin/
 RUN chmod a+x /usr/local/bin/* && chmod a+x HWTL_SDU-ANNS/*
 RUN ldconfig
 RUN pip3 install HWTL_SDU-ANNS/qsgngt-*-linux_x86_64.whl
