Skip to content

Commit fc621d5

Browse files
committed
[#26937] DocDB: YbHnsw persistence
Summary: The YbHnsw library currently generates a block-based representation for the HNSW (Hierarchical Navigable Small World) graph structure. However, this representation is only stored in memory and lacks persistence, meaning it is lost when the application terminates or restarts. To make YbHnsw a viable solution for implementing a vector index, we need to add functionality for saving the in-memory HNSW graph data to the file system and reloading it when needed. This persistence layer will ensure that the graph remains available across application sessions, improving usability and efficiency. Also added basic wiring (not the final interface) for the block cache, to provide the ability to load and unload blocks from disk in follow-up diffs. Jira: DB-16382 Test Plan: YbHnswTest.Persistence Reviewers: arybochkin Reviewed By: arybochkin Subscribers: ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D43473
1 parent d21ea7e commit fc621d5

15 files changed

+884
-312
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -897,6 +897,7 @@ set(YB_SUBDIR_NAMES
897897
fs
898898
gen_yrpc
899899
gutil
900+
hnsw
900901
integration-tests
901902
master
902903
qlexpr

src/yb/hnsw/CMakeLists.txt

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright (c) YugabyteDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4+
# in compliance with the License. You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software distributed under the License
9+
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10+
# or implied. See the License for the specific language governing permissions and limitations
11+
# under the License.
12+
#
13+
14+
# Precompiled-header prefix for this subdirectory.
set(YB_PCH_PREFIX hnsw)

# Fixed transposed variable names: NHSW_SRCS/NHSW_LIBS -> HNSW_SRCS/HNSW_LIBS.
set(HNSW_SRCS
    hnsw_block_cache.cc
    hnsw.cc
    )

set(HNSW_LIBS
    vector_index
    yb_common
    yb_util
    )

ADD_YB_LIBRARY(hnsw
               SRCS ${HNSW_SRCS}
               DEPS ${HNSW_LIBS})

set(YB_TEST_LINK_LIBS hnsw yb_common_test_util ${YB_MIN_TEST_LIBS})

ADD_YB_TEST(hnsw-test)

src/yb/hnsw/block_writer.h

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright (c) YugabyteDB, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4+
// in compliance with the License. You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software distributed under the License
9+
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10+
// or implied. See the License for the specific language governing permissions and limitations
11+
// under the License.
12+
//
13+
14+
#pragma once
15+
16+
#include "yb/hnsw/types.h"
17+
18+
#include "yb/util/logging.h"
19+
20+
namespace yb::hnsw {
21+
22+
class BlockWriter {
23+
public:
24+
BlockWriter() = default;
25+
BlockWriter(std::byte* out, std::byte* end) : out_(out), end_(end) {
26+
DCHECK_LE(out_, end_);
27+
}
28+
29+
explicit BlockWriter(DataBlock& block)
30+
: BlockWriter(block.data(), block.data() + block.size()) {}
31+
32+
BlockWriter(BlockWriter&& rhs)
33+
: out_(std::exchange(rhs.out_, nullptr)), end_(std::exchange(rhs.end_, nullptr)) {
34+
}
35+
36+
void operator=(BlockWriter&& rhs) {
37+
DCHECK_EQ(out_, end_);
38+
out_ = rhs.out_;
39+
end_ = rhs.end_;
40+
rhs.out_ = rhs.end_;
41+
}
42+
43+
~BlockWriter() {
44+
DCHECK_EQ(out_, end_);
45+
}
46+
47+
std::byte* out() const {
48+
return out_;
49+
}
50+
51+
std::byte* end() const {
52+
return end_;
53+
}
54+
55+
size_t SpaceLeft() const {
56+
return end_ - out_;
57+
}
58+
59+
BlockWriter Split(size_t size) {
60+
auto old_end = end_;
61+
end_ = out_ + size;
62+
DCHECK_LE(end_, old_end);
63+
return BlockWriter(end_, old_end);
64+
}
65+
66+
std::byte* Prepare(size_t size) {
67+
auto result = out_;
68+
out_ += size;
69+
DCHECK_LE(out_, end_);
70+
return result;
71+
}
72+
73+
template <class Value>
74+
void Append(Value value) {
75+
Store<Value, HnswEndian>(out_, value);
76+
out_ += sizeof(Value);
77+
}
78+
79+
template <class... Args>
80+
void AppendBytes(Args&&... args) {
81+
Slice slice(std::forward<Args>(args)...);
82+
memcpy(out_, slice.data(), slice.size());
83+
out_ += slice.size();
84+
}
85+
86+
private:
87+
std::byte* out_ = nullptr;
88+
std::byte* end_ = nullptr;
89+
};
90+
91+
} // namespace yb::hnsw

src/yb/vector_index/yb_hnsw-test.cc renamed to src/yb/hnsw/hnsw-test.cc

+39-12
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,20 @@
1111
// under the License.
1212
//
1313

14-
#include "yb/util/test_util.h"
14+
#include "yb/hnsw/hnsw.h"
15+
#include "yb/hnsw/hnsw_block_cache.h"
1516

1617
#include "yb/util/random_util.h"
18+
#include "yb/util/test_util.h"
1719
#include "yb/util/tsan_util.h"
1820

1921
#include "yb/vector_index/vector_index_fwd.h"
2022
#include "yb/vector_index/distance.h"
2123
#include "yb/vector_index/usearch_include_wrapper_internal.h"
22-
#include "yb/vector_index/yb_hnsw.h"
2324

24-
namespace yb::vector_index {
25+
namespace yb::hnsw {
2526

26-
using IndexImpl = unum::usearch::index_dense_gt<VectorId>;
27+
using IndexImpl = unum::usearch::index_dense_gt<vector_index::VectorId>;
2728
using Vector = std::vector<float>;
2829

2930
unum::usearch::index_dense_config_t CreateIndexDenseConfig() {
@@ -36,7 +37,7 @@ unum::usearch::index_dense_config_t CreateIndexDenseConfig() {
3637
}
3738

3839
struct AcceptAllVectors {
39-
bool operator()(const VectorId& id) const {
40+
bool operator()(const vector_index::VectorId& id) const {
4041
return true;
4142
}
4243
};
@@ -62,7 +63,7 @@ class YbHnswTest : public YBTest {
6263

6364
void InsertRandomVector(Vector& holder) {
6465
RandomVector(holder);
65-
ASSERT_TRUE(index_.add(VectorId::GenerateRandom(), holder.data()));
66+
ASSERT_TRUE(index_.add(vector_index::VectorId::GenerateRandom(), holder.data()));
6667
}
6768

6869
void InsertRandomVectors(size_t count) {
@@ -79,7 +80,7 @@ class YbHnswTest : public YBTest {
7980
}
8081

8182
void VerifySearch(const Vector& query_vector, size_t max_results) {
82-
VectorFilter filter = AcceptAllVectors();
83+
vector_index::VectorFilter filter = AcceptAllVectors();
8384
auto usearch_results = index_.filtered_search(query_vector.data(), max_results, filter);
8485
auto yb_hnsw_results = yb_hnsw_.Search(query_vector.data(), max_results, filter, context_);
8586
ASSERT_EQ(usearch_results.count, yb_hnsw_results.size());
@@ -91,24 +92,42 @@ class YbHnswTest : public YBTest {
9192
}
9293

9394
std::vector<Vector> PrepareRandom(size_t num_vectors, size_t num_searches);
95+
Status InitYbHnsw(bool load);
96+
9497
void TestPerf();
98+
void TestSimple(bool load);
9599

96100
size_t dimensions_ = 8;
97101
size_t max_vectors_ = 65536;
98102
std::mt19937_64 rng_{42};
99103
unum::usearch::metric_punned_t metric_;
100104
IndexImpl index_;
105+
BlockCachePtr block_cache_ = std::make_shared<BlockCache>(*Env::Default());
101106
YbHnsw yb_hnsw_;
102107
YbHnswSearchContext context_;
103108
};
104109

105-
TEST_F(YbHnswTest, Simple) {
110+
Status YbHnswTest::InitYbHnsw(bool load) {
111+
auto path = GetTestPath("0.yb_hnsw");
112+
if (load) {
113+
{
114+
YbHnsw temp(metric_);
115+
RETURN_NOT_OK(temp.Import(index_, path, block_cache_));
116+
}
117+
RETURN_NOT_OK(yb_hnsw_.Init(path, block_cache_));
118+
} else {
119+
RETURN_NOT_OK(yb_hnsw_.Import(index_, path, block_cache_));
120+
}
121+
return Status::OK();
122+
}
123+
124+
void YbHnswTest::TestSimple(bool load) {
106125
constexpr size_t kNumVectors = 100;
107126
constexpr size_t kNumSearches = 10;
108127
constexpr size_t kMaxResults = 10;
109128

110129
InsertRandomVectors(kNumVectors);
111-
yb_hnsw_.Import(index_);
130+
ASSERT_OK(InitYbHnsw(load));
112131

113132
Vector query_vector;
114133
for (size_t i = 0; i != kNumSearches; ++i) {
@@ -117,10 +136,18 @@ TEST_F(YbHnswTest, Simple) {
117136
}
118137
}
119138

139+
TEST_F(YbHnswTest, Simple) {
140+
TestSimple(/* load= */ false);
141+
}
142+
143+
TEST_F(YbHnswTest, Persistence) {
144+
TestSimple(/* load= */ true);
145+
}
146+
120147
std::vector<Vector> YbHnswTest::PrepareRandom(size_t num_vectors, size_t num_searches) {
121148
EXPECT_LE(num_vectors, max_vectors_);
122149
InsertRandomVectors(num_vectors);
123-
yb_hnsw_.Import(index_);
150+
EXPECT_OK(InitYbHnsw(false));
124151

125152
std::vector<Vector> query_vectors(num_searches);
126153
for (auto& vector : query_vectors) {
@@ -151,7 +178,7 @@ void YbHnswTest::TestPerf() {
151178

152179
auto query_vectors = PrepareRandom(num_vectors, num_searches);
153180
YbHnswSearchContext context;
154-
VectorFilter filter = AcceptAllVectors();
181+
vector_index::VectorFilter filter = AcceptAllVectors();
155182
MonoTime start = MonoTime::Now();
156183
for (const auto& query_vector : query_vectors) {
157184
index_.filtered_search(query_vector.data(), kMaxResults, filter);
@@ -183,4 +210,4 @@ TEST_F(YbHnswTest, Perf2048Dims) {
183210
TestPerf();
184211
}
185212

186-
} // namespace yb::vector_index
213+
} // namespace yb::hnsw

0 commit comments

Comments
 (0)