Skip to content

Commit a886b8d

Browse files
committed
[#25406] DocDB: Random test for vector index
Summary: Adds a randomized test for the vector index that uses a large number of rows and dimensions.
Jira: DB-14638
Test Plan: PgVectorIndexTest.Random/*
Reviewers: aleksandr.ponomarenko, arybochkin
Reviewed By: aleksandr.ponomarenko
Subscribers: ybase, yql
Tags: #jenkins-ready
Differential Revision: https://phorge.dev.yugabyte.com/D40845
1 parent ac52997 commit a886b8d

File tree

4 files changed

+144
-44
lines changed

4 files changed

+144
-44
lines changed

src/yb/docdb/usearch_vector_index-test.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ TEST_F(UsearchVectorIndexTest, CreateAndQuery) {
6767
for (size_t thread_index = 0; thread_index < kNumIndexingThreads; ++thread_index) {
6868
indexing_thread_holder.AddThreadFunctor(
6969
[&num_vectors_inserted, &index, &latch, &uniform_distrib]() {
70-
std::random_device rd;
7170
size_t vector_id;
7271
while ((vector_id = num_vectors_inserted.fetch_add(1)) < kNumVectors) {
7372
auto vec = RandomFloatVector(kDimensions, uniform_distrib);

src/yb/docdb/vector_index.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,21 @@
3333
#include "yb/vector_index/usearch_wrapper.h"
3434
#include "yb/vector_index/vector_lsm.h"
3535

36-
DEFINE_RUNTIME_uint64(vector_index_initial_chunk_size, 1024,
36+
DEFINE_RUNTIME_uint64(vector_index_initial_chunk_size, 100000,
3737
"Number of vector in initial vector index chunk");
3838

39+
DEFINE_RUNTIME_PREVIEW_uint32(vector_index_ef, 128,
40+
"The \"expansion\" parameter for search");
41+
42+
DEFINE_RUNTIME_PREVIEW_uint32(vector_index_ef_construction, 256,
43+
"The \"expansion\" parameter during graph construction");
44+
45+
DEFINE_RUNTIME_PREVIEW_uint32(vector_index_num_neighbors_per_vertex, 32,
46+
"Number of neighbors per graph node");
47+
48+
DEFINE_RUNTIME_PREVIEW_uint32(vector_index_num_neighbors_per_vertex_base, 128,
49+
"Number of neighbors per graph node in base level graph");
50+
3951
namespace yb::docdb {
4052

4153
const std::string kVectorIndexDirPrefix = "vi-";
@@ -48,6 +60,10 @@ auto VectorLSMFactory(size_t dimensions) {
4860
return [dimensions] {
4961
vector_index::HNSWOptions hnsw_options = {
5062
.dimensions = dimensions,
63+
.num_neighbors_per_vertex = FLAGS_vector_index_num_neighbors_per_vertex,
64+
.num_neighbors_per_vertex_base = FLAGS_vector_index_num_neighbors_per_vertex_base,
65+
.ef_construction = FLAGS_vector_index_ef_construction,
66+
.ef = FLAGS_vector_index_ef,
5167
};
5268
return FactoryImpl::Create(hnsw_options);
5369
};

src/yb/util/random_util.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,14 @@ typename Collection::const_reference RandomElement(const Collection& collection,
149149
std::string RandomHumanReadableString(size_t len, std::mt19937_64* rng = nullptr);
150150

151151
template<typename Distribution>
152-
std::vector<float> RandomFloatVector(size_t dimensions, Distribution& dis) {
152+
std::vector<float> RandomFloatVector(
153+
size_t dimensions, Distribution& dis, std::mt19937_64* rng = nullptr) {
154+
if (!rng) {
155+
rng = &ThreadLocalRandom();
156+
}
153157
std::vector<float> vec(dimensions);
154158
for (auto& v : vec) {
155-
v = dis(ThreadLocalRandom());
159+
v = dis(*rng);
156160
}
157161
return vec;
158162
}

src/yb/yql/pgwrapper/pg_vector_index-test.cc

Lines changed: 121 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
// under the License.
1212
//
1313

14+
#include <queue>
15+
1416
#include "yb/client/snapshot_test_util.h"
1517

1618
#include "yb/consensus/consensus.h"
@@ -28,6 +30,8 @@
2830
#include "yb/util/backoff_waiter.h"
2931
#include "yb/util/test_thread_holder.h"
3032

33+
#include "yb/vector_index/usearch_include_wrapper_internal.h"
34+
3135
#include "yb/yql/pgwrapper/pg_mini_test_base.h"
3236

3337
DECLARE_bool(TEST_skip_process_apply);
@@ -38,6 +42,12 @@ DECLARE_uint32(vector_index_concurrent_writes);
3842

3943
namespace yb::pgwrapper {
4044

45+
using FloatVector = std::vector<float>;
46+
47+
// Exposes the float storage of `vector` as the raw byte pointer type that usearch metrics
// take as operands. The result aliases vector.data(), so it is only usable while `vector`
// is alive and not reallocated.
const unum::usearch::byte_t* VectorToBytePtr(const FloatVector& vector) {
  const float* raw_components = vector.data();
  return pointer_cast<const unum::usearch::byte_t*>(raw_components);
}
50+
4151
class PgVectorIndexTest : public PgMiniTestBase, public testing::WithParamInterface<bool> {
4252
protected:
4353
void SetUp() override {
@@ -57,43 +67,45 @@ class PgVectorIndexTest : public PgMiniTestBase, public testing::WithParamInterf
5767
return IsColocated() ? ConnectToDB("colocated_db") : PgMiniTestBase::Connect();
5868
}
5969

60-
Result<PGConn> MakeIndex(int num_tablets = 0) {
70+
Result<PGConn> MakeIndex(size_t dimensions = 3) {
6171
auto colocated = IsColocated();
6272
auto conn = VERIFY_RESULT(PgMiniTestBase::Connect());
6373
std::string create_suffix;
6474
if (colocated) {
6575
create_suffix = " WITH (COLOCATED = 1)";
6676
RETURN_NOT_OK(conn.ExecuteFormat("CREATE DATABASE colocated_db COLOCATION = true"));
6777
conn = VERIFY_RESULT(Connect());
68-
} else if (num_tablets) {
69-
create_suffix = Format(" SPLIT INTO $0 TABLETS", num_tablets);
7078
}
7179
RETURN_NOT_OK(conn.Execute("CREATE EXTENSION vector"));
72-
RETURN_NOT_OK(conn.Execute(
73-
"CREATE TABLE test (id bigserial PRIMARY KEY, embedding vector(3))" + create_suffix));
80+
RETURN_NOT_OK(conn.ExecuteFormat(
81+
"CREATE TABLE test (id bigserial PRIMARY KEY, embedding vector($0))$1",
82+
dimensions, create_suffix));
7483

7584
RETURN_NOT_OK(conn.Execute("CREATE INDEX ON test USING ybhnsw (embedding vector_l2_ops)"));
7685

7786
return conn;
7887
}
7988

80-
Status WaitForLoadBalance(int num_tablet_servers) {
81-
return WaitFor(
82-
[&]() -> Result<bool> { return client_->IsLoadBalanced(num_tablet_servers); },
83-
60s * kTimeMultiplier,
84-
Format("Wait for load balancer to balance to $0 tservers.", num_tablet_servers));
85-
}
86-
87-
Result<PGConn> MakeIndexAndFill(int num_rows, int num_tablets = 0);
88-
Status InsertRows(PGConn& conn, int start_row, int end_row);
89+
Result<PGConn> MakeIndexAndFill(size_t num_rows);
90+
Result<PGConn> MakeIndexAndFillRandom(size_t num_rows, size_t dimensions);
91+
Status InsertRows(PGConn& conn, size_t start_row, size_t end_row);
92+
Status InsertRandomRows(PGConn& conn, size_t num_rows, size_t dimensions);
8993

90-
void VerifyRead(PGConn& conn, int limit, bool add_filter);
94+
void VerifyRead(PGConn& conn, size_t limit, bool add_filter);
9195
void VerifyRows(
92-
PGConn& conn, bool add_filter, const std::vector<std::string>& expected, int limit = -1);
96+
PGConn& conn, bool add_filter, const std::vector<std::string>& expected, int64_t limit = -1);
9397

9498
void TestSimple();
9599
void TestManyRows(bool add_filter);
96100
void TestRestart(tablet::FlushFlags flush_flags);
101+
102+
// Generates a vector with `dimensions` random components, drawing from distribution_ with the
// fixture-owned generator rng_ rather than the thread-local one.
FloatVector RandomVector(size_t dimensions) {
  auto* generator = &rng_;
  return RandomFloatVector(dimensions, distribution_, generator);
}
105+
106+
std::vector<FloatVector> vectors_;
107+
std::uniform_real_distribution<> distribution_;
108+
std::mt19937_64 rng_{42};
97109
};
98110

99111
void PgVectorIndexTest::TestSimple() {
@@ -167,53 +179,70 @@ std::string ExpectedRow(int64_t id) {
167179
return BuildRow(id, VectorAsString(id));
168180
}
169181

170-
Status PgVectorIndexTest::InsertRows(PGConn& conn, int start_row, int end_row) {
182+
Status PgVectorIndexTest::InsertRows(PGConn& conn, size_t start_row, size_t end_row) {
171183
RETURN_NOT_OK(conn.StartTransaction(IsolationLevel::SNAPSHOT_ISOLATION));
172-
for (int i = start_row; i <= end_row; ++i) {
184+
for (auto i = start_row; i <= end_row; ++i) {
173185
RETURN_NOT_OK(conn.ExecuteFormat(
174186
"INSERT INTO test VALUES ($0, '$1')", i, VectorAsString(i)));
175187
}
176188
return conn.CommitTransaction();
177189
}
178190

179-
Result<PGConn> PgVectorIndexTest::MakeIndexAndFill(int num_rows, int num_tablets) {
180-
auto conn = VERIFY_RESULT(MakeIndex(num_tablets));
191+
// Inserts `num_rows` random vectors with `dimensions` components into table `test` inside a
// single snapshot isolation transaction.
//
// Each row id is the index the vector will occupy in vectors_, so ids returned by queries can
// be mapped back to the in-memory copy. A vector is appended to vectors_ only after its INSERT
// succeeded. Returns the first failing status, or the result of the commit.
Status PgVectorIndexTest::InsertRandomRows(PGConn& conn, size_t num_rows, size_t dimensions) {
  RETURN_NOT_OK(conn.StartTransaction(IsolationLevel::SNAPSHOT_ISOLATION));
  for (size_t remaining = num_rows; remaining != 0; --remaining) {
    auto embedding = RandomVector(dimensions);
    const auto row_id = vectors_.size();
    RETURN_NOT_OK(conn.ExecuteFormat(
        "INSERT INTO test VALUES ($0, '$1')", row_id, AsString(embedding)));
    vectors_.push_back(std::move(embedding));
  }
  return conn.CommitTransaction();
}
201+
202+
Result<PGConn> PgVectorIndexTest::MakeIndexAndFill(size_t num_rows) {
203+
auto conn = VERIFY_RESULT(MakeIndex());
181204
RETURN_NOT_OK(InsertRows(conn, 1, num_rows));
182205
return conn;
183206
}
184207

208+
// Creates the `test` table and its index for vectors of `dimensions` components via MakeIndex,
// then loads `num_rows` random rows via InsertRandomRows.
// Returns the connection used for both steps, or the first error encountered.
Result<PGConn> PgVectorIndexTest::MakeIndexAndFillRandom(size_t num_rows, size_t dimensions) {
  auto connection = VERIFY_RESULT(MakeIndex(dimensions));
  RETURN_NOT_OK(InsertRandomRows(connection, num_rows, dimensions));
  return connection;
}
213+
185214
void PgVectorIndexTest::VerifyRows(
186-
PGConn& conn, bool add_filter, const std::vector<std::string>& expected, int limit) {
215+
PGConn& conn, bool add_filter, const std::vector<std::string>& expected, int64_t limit) {
187216
auto result = ASSERT_RESULT((conn.FetchRows<RowAsString>(Format(
188217
"SELECT * FROM test $0 ORDER BY embedding <-> '[0.0, 0.0, 0.0]' LIMIT $1",
189218
add_filter ? "WHERE id + 3 <= 5" : "",
190-
limit == -1 ? expected.size() : make_unsigned(limit)))));
219+
limit < 0 ? expected.size() : make_unsigned(limit)))));
191220
EXPECT_EQ(result.size(), expected.size());
192221
for (size_t i = 0; i != std::min(result.size(), expected.size()); ++i) {
193222
SCOPED_TRACE(Format("Row $0", i));
194223
EXPECT_EQ(result[i], expected[i]);
195224
}
196225
}
197226

198-
void PgVectorIndexTest::VerifyRead(PGConn& conn, int limit, bool add_filter) {
227+
void PgVectorIndexTest::VerifyRead(PGConn& conn, size_t limit, bool add_filter) {
199228
std::vector<std::string> expected;
200-
for (int i = 1; i <= limit; ++i) {
229+
for (size_t i = 1; i <= limit; ++i) {
201230
expected.push_back(ExpectedRow(i));
202231
}
203232
VerifyRows(conn, add_filter, expected);
204233
}
205234

206235
void PgVectorIndexTest::TestManyRows(bool add_filter) {
207-
constexpr int kNumRows = RegularBuildVsSanitizers(2000, 64);
208-
const int query_limit = add_filter ? 1 : 5;
236+
constexpr size_t kNumRows = RegularBuildVsSanitizers(2000, 64);
237+
const size_t query_limit = add_filter ? 1 : 5;
209238

210239
auto conn = ASSERT_RESULT(MakeIndexAndFill(kNumRows));
211240
ASSERT_NO_FATALS(VerifyRead(conn, query_limit, add_filter));
212241
}
213242

214243
TEST_P(PgVectorIndexTest, Split) {
215-
constexpr int kNumRows = RegularBuildVsSanitizers(500, 64);
216-
constexpr int kQueryLimit = 5;
244+
constexpr size_t kNumRows = RegularBuildVsSanitizers(500, 64);
245+
constexpr size_t kQueryLimit = 5;
217246

218247
auto conn = ASSERT_RESULT(MakeIndexAndFill(kNumRows));
219248
ASSERT_OK(cluster_->FlushTablets());
@@ -236,17 +265,17 @@ TEST_P(PgVectorIndexTest, ManyReads) {
236265
ANNOTATE_UNPROTECTED_WRITE(FLAGS_vector_index_concurrent_reads) = 1;
237266
ANNOTATE_UNPROTECTED_WRITE(FLAGS_vector_index_concurrent_writes) = 1;
238267

239-
constexpr int kNumRows = 64;
240-
constexpr int kNumReads = 16;
268+
constexpr size_t kNumRows = 64;
269+
constexpr size_t kNumReads = 16;
241270

242271
auto conn = ASSERT_RESULT(MakeIndexAndFill(kNumRows));
243272

244273
TestThreadHolder threads;
245-
for (int i = 1; i <= kNumReads; ++i) {
274+
for (size_t i = 1; i <= kNumReads; ++i) {
246275
threads.AddThreadFunctor([this, &stop_flag = threads.stop_flag()] {
247276
auto conn = ASSERT_RESULT(Connect());
248277
while (!stop_flag.load()) {
249-
auto id = RandomUniformInt(1, kNumRows);
278+
auto id = RandomUniformInt<size_t>(1, kNumRows);
250279
auto vector = VectorAsString(id);
251280
auto rows = ASSERT_RESULT(conn.FetchAllAsString(Format(
252281
"SELECT * FROM test ORDER BY embedding <-> '$0' LIMIT 1", vector)));
@@ -259,8 +288,8 @@ TEST_P(PgVectorIndexTest, ManyReads) {
259288
}
260289

261290
void PgVectorIndexTest::TestRestart(tablet::FlushFlags flush_flags) {
262-
constexpr int kNumRows = 64;
263-
constexpr int kQueryLimit = 5;
291+
constexpr size_t kNumRows = 64;
292+
constexpr size_t kQueryLimit = 5;
264293

265294
auto conn = ASSERT_RESULT(MakeIndexAndFill(kNumRows));
266295
ASSERT_NO_FATALS(VerifyRead(conn, kQueryLimit, false));
@@ -284,7 +313,7 @@ TEST_P(PgVectorIndexTest, BootstrapFlushedIntentsDB) {
284313
}
285314

286315
TEST_P(PgVectorIndexTest, DeleteAndUpdate) {
287-
constexpr int kNumRows = 64;
316+
constexpr size_t kNumRows = 64;
288317
const std::string kDistantVector = "[100, 500, 9000]";
289318
const std::string kCloseVector = "[0.125, 0.25, 0.375]";
290319

@@ -309,12 +338,12 @@ TEST_P(PgVectorIndexTest, DeleteAndUpdate) {
309338
}
310339

311340
TEST_P(PgVectorIndexTest, RemoteBootstrap) {
312-
constexpr int kNumRows = 64;
313-
constexpr int kQueryLimit = 5;
341+
constexpr size_t kNumRows = 64;
342+
constexpr size_t kQueryLimit = 5;
314343

315344
auto* mts = cluster_->mini_tablet_server(2);
316345
mts->Shutdown();
317-
auto conn = ASSERT_RESULT(MakeIndexAndFill(kNumRows, 3));
346+
auto conn = ASSERT_RESULT(MakeIndexAndFill(kNumRows));
318347
const auto table_id = ASSERT_RESULT(GetTableIDFromTableName("test"));
319348
ASSERT_OK(cluster_->FlushTablets());
320349
for (const auto& peer : ListTableActiveTabletPeers(cluster_.get(), table_id)) {
@@ -355,8 +384,8 @@ TEST_P(PgVectorIndexTest, RemoteBootstrap) {
355384
}
356385

357386
TEST_P(PgVectorIndexTest, SnapshotSchedule) {
358-
constexpr int kNumRows = 128;
359-
constexpr int kQueryLimit = 5;
387+
constexpr size_t kNumRows = 128;
388+
constexpr size_t kQueryLimit = 5;
360389

361390
client::SnapshotTestUtil snapshot_util;
362391
snapshot_util.SetProxy(&client_->proxy_cache());
@@ -383,6 +412,58 @@ TEST_P(PgVectorIndexTest, SnapshotSchedule) {
383412
ASSERT_NO_FATALS(VerifyRead(conn, kQueryLimit, false));
384413
}
385414

415+
// Measures recall of the vector index on random data.
//
// Loads kNumRows random kDimensions-dimensional vectors, then for each of kNumIterations random
// query vectors fetches the kLimit nearest ids through SQL and compares them against an exact
// ranking computed locally with the usearch squared-L2 metric (squared L2 orders candidates
// the same way as L2). Fetched ids must appear in the same relative order as in the exact
// ranking; entries of the exact ranking that were skipped before the last fetched id are
// counted as "missing". The test fails when more than 2% of the kLimit * kNumIterations
// expected results are missing.
TEST_P(PgVectorIndexTest, Random) {
  constexpr size_t kLimit = 10;
  constexpr size_t kDimensions = 64;
  constexpr size_t kNumRows = RegularBuildVsDebugVsSanitizers(10000, 1000, 100);
  constexpr int kNumIterations = RegularBuildVsDebugVsSanitizers(100, 20, 10);

  unum::usearch::metric_punned_t metric(
      kDimensions, unum::usearch::metric_kind_t::l2sq_k, unum::usearch::scalar_kind_t::f32_k);

  auto conn = ASSERT_RESULT(MakeIndexAndFillRandom(kNumRows, kDimensions));
  size_t sum_missing = 0;
  // counts[k] is the number of queries that had exactly k missing entries.
  std::vector<size_t> counts;
  for (int i = 0; i != kNumIterations; ++i) {
    auto query_vector = RandomVector(kDimensions);
    auto rows = ASSERT_RESULT(conn.FetchRows<int64_t>(Format(
        "SELECT id FROM test ORDER BY embedding <-> '$0' LIMIT $1", query_vector, kLimit)));
    // The table always holds more than kLimit rows, so a short result is an index failure.
    // Checking it here also keeps the `missing` arithmetic below from wrapping around.
    ASSERT_EQ(rows.size(), kLimit);

    // Exact ranking: all row ids (which equal indexes into vectors_) ordered by distance to
    // the query vector.
    std::vector<int64_t> expected(vectors_.size());
    std::generate(expected.begin(), expected.end(), [n{0LL}]() mutable { return n++; });
    std::sort(
        expected.begin(), expected.end(),
        [&metric, &query_vector, &vectors = vectors_](size_t li, size_t ri) {
          const auto& lhs = vectors[li];
          const auto& rhs = vectors[ri];
          return metric(VectorToBytePtr(query_vector), VectorToBytePtr(lhs)) <
                 metric(VectorToBytePtr(query_vector), VectorToBytePtr(rhs));
        });

    // Walk the exact ranking once, requiring each fetched id to show up at or after the
    // position following the previous match. `ep` ends one past the last matched position.
    size_t ep = 0;
    for (int64_t id : rows) {
      while (ep < expected.size() && id != expected[ep]) {
        ++ep;
      }
      ASSERT_LT(ep, expected.size());
      ASSERT_EQ(id, expected[ep]);
      ++ep;
    }

    // ep grows by at least one per fetched row, so this difference cannot underflow.
    const size_t missing = ep - rows.size();
    if (missing > counts.size()) {
      LOG(INFO)
          << "New max: " << missing << ", fetched: " << AsString(rows) << ", expected: "
          << AsString(boost::make_iterator_range(expected.begin(), expected.begin() + ep));
    }
    counts.resize(std::max(counts.size(), missing + 1));
    ++counts[missing];
    sum_missing += missing;
  }
  LOG(INFO)
      << "Counts: " << AsString(counts)
      << ", recall: " << 1.0 - sum_missing * 1.0 / (kLimit * kNumIterations);
  // Allow at most 2% of the expected results to be missing.
  ASSERT_LE(sum_missing * 50, kLimit * kNumIterations);
}
466+
386467
// Maps the boolean test parameter to a readable name: "Colocated" when the parameter is true,
// "Distributed" otherwise.
std::string ColocatedToString(const testing::TestParamInfo<bool>& param_info) {
  if (param_info.param) {
    return "Colocated";
  }
  return "Distributed";
}

0 commit comments

Comments
 (0)