Serialize Vamana index with SSD sector alignment per MSFT DiskANN format, generate quantized dataset for integration with DiskANN #846

Open
wants to merge 48 commits into base: branch-25.08

Changes from 41 commits

Commits (48)
8c650ab
Serialize Vamana index with SSD sector alignment
jamxia155 Feb 14, 2025
bad5463
(WIP) Quantizer file parser
jamxia155 Feb 27, 2025
ac4199f
Move helper function to anonymous namespace
jamxia155 Feb 28, 2025
4b6a4d4
Remove update to unused function
jamxia155 Feb 28, 2025
c931f87
DiskANN quantization
jamxia155 Mar 5, 2025
765ffcd
clang-format
jamxia155 Apr 2, 2025
d8b1101
Separate pq pivots and rotation matrix inputs
jamxia155 Apr 5, 2025
0bf35ee
Align output file names with DiskANN
jamxia155 Apr 5, 2025
d0aabc6
Update usage description
jamxia155 Apr 9, 2025
6e5d908
clang-format
jamxia155 Apr 25, 2025
90eaf18
Specify condition for using quantized dataset
jamxia155 Apr 26, 2025
42630fe
Address PR comments
jamxia155 May 12, 2025
38646cf
Merge remote-tracking branch 'upstream/branch-25.06' into jamxia_vama…
jamxia155 May 12, 2025
81cbd56
API refactor
jamxia155 May 20, 2025
62b1392
Undo unnecessary API change
jamxia155 May 20, 2025
44f479b
Address minor review comments
jamxia155 May 20, 2025
a40237e
Avoid data conversion conditionally
jamxia155 May 20, 2025
673c087
Merge remote-tracking branch 'upstream/branch-25.06' into jamxia_vama…
jamxia155 May 20, 2025
f78ebcb
clang-format
jamxia155 May 20, 2025
31a1ddb
Minor formatting change
jamxia155 May 20, 2025
b0feec1
Minor rename
jamxia155 May 20, 2025
912a651
Retry CI
jamxia155 May 20, 2025
40beb58
Remove unused header
jamxia155 May 22, 2025
f38fd45
Address review comments
jamxia155 May 24, 2025
9118ad2
Merge remote-tracking branch 'upstream/branch-25.06' into jamxia_vama…
jamxia155 May 24, 2025
957fee1
Check file creation
jamxia155 May 24, 2025
5b1bc2e
Check that output files exist and are non-empty
jamxia155 May 24, 2025
bc1c546
Switch to randomized codebooks (WIP)
jamxia155 May 29, 2025
7ba37ab
Merge branch 'rapidsai:branch-25.08' into jamxia_vamana_serialize_qua…
jamxia155 May 30, 2025
7901903
Fetch codebooks from url
jamxia155 May 30, 2025
ce7b0d6
Install test files
jamxia155 Jun 2, 2025
f4b3d9d
Update test file paths
jamxia155 Jun 9, 2025
ed1e90d
Merge remote-tracking branch 'upstream/branch-25.08' into jamxia_vama…
jamxia155 Jun 9, 2025
6a36cfc
Fix path
jamxia155 Jun 9, 2025
a716f7d
Add quotes
jamxia155 Jun 9, 2025
8464d13
Fix if condition
jamxia155 Jun 9, 2025
868e6cb
Add script for fetching baseline files from URL
jamxia155 Jun 10, 2025
910b465
Download test data files in cmake
jamxia155 Jun 10, 2025
c8660f2
Merge remote-tracking branch 'upstream/branch-25.08' into jamxia_vama…
jamxia155 Jun 10, 2025
e6078db
Fix path
jamxia155 Jun 11, 2025
3fec92e
Reduce verbosity
jamxia155 Jun 11, 2025
57d8554
Enable reproducing CI locally
jamxia155 Jun 11, 2025
950e5ca
Merge branch 'branch-25.08' into jamxia_vamana_serialize_quantize_build
cjnolet Jun 12, 2025
7a92b2e
Retry CI
jamxia155 Jun 12, 2025
26feaaf
Merge remote-tracking branch 'jamxia_cuvs/jamxia_vamana_serialize_qua…
jamxia155 Jun 12, 2025
d36f91c
Update copyright years
jamxia155 Jun 13, 2025
e84cbf0
Update copyright year
jamxia155 Jun 13, 2025
111b7c7
Retry CI
jamxia155 Jun 13, 2025
8 changes: 8 additions & 0 deletions ci/test_cpp.sh
@@ -30,6 +30,14 @@ rapids-print-env
rapids-logger "Check GPU usage"
nvidia-smi

# RAPIDS_DATASET_ROOT_DIR is used by test scripts
RAPIDS_DATASET_ROOT_DIR=${RAPIDS_TESTS_DIR}/dataset
mkdir -p "${RAPIDS_DATASET_ROOT_DIR}"
export RAPIDS_DATASET_ROOT_DIR
pushd "${RAPIDS_DATASET_ROOT_DIR}"
${GITHUB_WORKSPACE}/cpp/tests/get_test_data.sh --NEIGHBORS_ANN_VAMANA_TEST
Contributor
@bdice bdice Jun 11, 2025

This will only work in GitHub Actions and will fail when reproducing CI locally. We need a solution that doesn't involve ${GITHUB_WORKSPACE}.

I would make the get_test_data.sh script aware of ${RAPIDS_DATASET_ROOT_DIR}. That way it has a default location in which it downloads, and you can continue to call that script from the repository root with ./cpp/tests/get_test_data.sh. Avoid changing directories if you can, just to keep this script cleaner.

Also I would consider moving get_test_data.sh to a different folder. cuGraph uses a datasets/ directory to manage these scripts, which is good. https://github.com/rapidsai/cugraph/tree/branch-25.08/datasets Otherwise you could put it in ci/.

Author

Avoided ${GITHUB_WORKSPACE}, moved get_test_data.sh to ci/ (there doesn't seem to be a more fitting existing location), and let get_test_data.sh read the download path from ${RAPIDS_DATASET_ROOT_DIR} when it is defined.

popd

EXITCODE=0
trap "EXITCODE=1" ERR
set +e
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -485,6 +485,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/vamana_build_float.cu
src/neighbors/vamana_build_uint8.cu
src/neighbors/vamana_build_int8.cu
src/neighbors/vamana_codebooks_float.cu
src/neighbors/vamana_serialize_float.cu
src/neighbors/vamana_serialize_uint8.cu
src/neighbors/vamana_serialize_int8.cu
87 changes: 83 additions & 4 deletions cpp/include/cuvs/neighbors/vamana.hpp
@@ -53,6 +53,17 @@ namespace cuvs::neighbors::vamana {
*/

struct index_params : cuvs::neighbors::index_params {
/**
* @brief Parameters used to build quantized DiskANN index; to be generated using
* deserialize_codebooks()
*/
template <typename T = float>
struct codebook_params {
int pq_codebook_size;
int pq_dim;
std::vector<T> pq_encoding_table;
std::vector<T> rotation_matrix;
};
/** Maximum degree of output graph corresponds to the R parameter in the original Vamana
* literature. */
uint32_t graph_degree = 32;
@@ -72,6 +83,8 @@
uint32_t queue_size = 127;
/** Max batchsize of reverse edge processing (reduces memory footprint) */
uint32_t reverse_batchsize = 1000000;
/** Codebooks and related parameters */
std::optional<codebook_params<float>> codebooks = std::nullopt;
};
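A minimal usage sketch of wiring DiskANN codebooks into these parameters (assuming the float/uint32_t build() overload declared later in this header; the codebook prefix below is a placeholder path):

#include <cuvs/neighbors/vamana.hpp>

auto build_with_diskann_codebooks(
  raft::resources const& res,
  raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
  -> cuvs::neighbors::vamana::index<float, uint32_t>
{
  cuvs::neighbors::vamana::index_params params;
  params.graph_degree = 32;
  // Populate the optional codebooks from DiskANN pq-pivots / rotation-matrix files.
  params.codebooks = cuvs::neighbors::vamana::deserialize_codebooks(
    "/path/to/codebook/prefix", static_cast<int>(dataset.extent(1)));
  // build() then also produces the PQ-encoded dataset alongside the graph.
  return cuvs::neighbors::vamana::build(res, params, dataset);
}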

/**
@@ -127,6 +140,13 @@ struct index : cuvs::neighbors::index {
return *dataset_;
}

/** Quantized dataset [size, codes_rowlen] */
[[nodiscard]] inline auto quantized_data() const noexcept
-> raft::device_matrix_view<const uint8_t, int64_t, raft::row_major>
{
Contributor

device_matrix_view or host_matrix_view as the output type, rather than a dataset type object.

Author

I wanted to model this function after vamana::index::data() so that we can reuse the code in the serializer. Is there a reason we shouldn't return dataset from quantized_data() while it's ok to do so from data()?

Contributor

The return type of vamana::index::data() might have been an oversight. Here's what I think. quantized_data() is an important public API on the index. As a new cuvs user, one should not have to familiarize themselves with obscure data types such as dataset. My understanding is that those are for internal use-cases or to be used by other rapids repos. mdspan return types are a lot more user-friendly / readable on the other hand.

Contributor

While I agree that it would be nice to let users avoid the dataset type, the data() API call by other algorithms also returns a dataset object, not a matrix. So, if we want to change how these API calls work, we would need to be consistent across CAGRA and other algorithms as well.

Contributor

Ah I did not realize that data() returns a dataset object in CAGRA too. In that case vamana::index::data() should be fine. But maybe the quantized data can be returned as a matrix. That is just my opinion, but @cjnolet can help design the API.

Author

Updated to const device_matrix_view.

return quantized_dataset_.view();
}
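As an aside, a small sketch of copying these codes back to the host for inspection, assuming a built index idx and raft::resources res are in scope:

auto codes = idx.quantized_data();  // [n_rows, codes_rowlen] view of uint8_t codes
auto host_codes =
  raft::make_host_matrix<uint8_t, int64_t>(codes.extent(0), codes.extent(1));
raft::copy(host_codes.data_handle(),
           codes.data_handle(),
           codes.size(),
           raft::resource::get_cuda_stream(res));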

/** vamana graph [size, graph-degree] */
[[nodiscard]] inline auto graph() const noexcept
-> raft::device_matrix_view<const IdxT, int64_t, raft::row_major>
@@ -150,7 +170,8 @@
: cuvs::neighbors::index(),
metric_(metric),
graph_(raft::make_device_matrix<IdxT, int64_t>(res, 0, 0)),
dataset_(new cuvs::neighbors::empty_dataset<int64_t>(0))
dataset_(new cuvs::neighbors::empty_dataset<int64_t>(0)),
quantized_dataset_(raft::make_device_matrix<uint8_t, int64_t>(res, 0, 0))
{
}

@@ -168,6 +189,7 @@
metric_(metric),
graph_(raft::make_device_matrix<IdxT, int64_t>(res, 0, 0)),
dataset_(make_aligned_dataset(res, dataset, 16)),
quantized_dataset_(raft::make_device_matrix<uint8_t, int64_t>(res, 0, 0)),
medoid_id_(medoid_id)
{
RAFT_EXPECTS(dataset.extent(0) == vamana_graph.extent(0),
@@ -212,11 +234,28 @@
graph_view_ = graph_.view();
}

/**
* @brief Replace the current quantized dataset with a new quantized dataset.
*
* Ownership of the new quantized dataset is transferred to the index.
*
* @param[in] res
* @param[in] new_quantized_dataset the new quantized dataset for the index
*
*/
void update_quantized_dataset(
raft::resources const& res,
raft::device_matrix<uint8_t, int64_t, raft::row_major>&& new_quantized_dataset)
{
quantized_dataset_ = std::move(new_quantized_dataset);
}
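A brief usage sketch of the move-based transfer (n_rows, codes_rowlen, and idx are assumed to be defined by the caller):

auto codes = raft::make_device_matrix<uint8_t, int64_t>(res, n_rows, codes_rowlen);
// ... fill codes with PQ-encoded rows ...
idx.update_quantized_dataset(res, std::move(codes));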

private:
cuvs::distance::DistanceType metric_;
raft::device_matrix<IdxT, int64_t, raft::row_major> graph_;
raft::device_matrix_view<const IdxT, int64_t, raft::row_major> graph_view_;
std::unique_ptr<neighbors::dataset<int64_t>> dataset_;
raft::device_matrix<uint8_t, int64_t, raft::row_major> quantized_dataset_;
IdxT medoid_id_;
};
/**
@@ -457,13 +496,15 @@ auto build(raft::resources const& res,
* @param[in] file_prefix prefix of path and name of index files
* @param[in] index Vamana index
* @param[in] include_dataset whether or not to serialize the dataset
* @param[in] sector_aligned whether the output file should be aligned to 4096-byte disk sectors
*
*/

void serialize(raft::resources const& handle,
const std::string& file_prefix,
const cuvs::neighbors::vamana::index<float, uint32_t>& index,
bool include_dataset = true);
bool include_dataset = true,
bool sector_aligned = false);
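A usage sketch for the new flag (the output prefix is a placeholder), writing a sector-aligned index file for the DiskANN SSD layout:

cuvs::neighbors::vamana::serialize(res,
                                   "/path/to/output/prefix",
                                   idx,
                                   /*include_dataset=*/true,
                                   /*sector_aligned=*/true);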

/**
* Save the index to file.
@@ -486,12 +527,14 @@ void serialize(raft::resources const& handle,
* @param[in] file_prefix prefix of path and name of index files
* @param[in] index Vamana index
* @param[in] include_dataset whether or not to serialize the dataset
* @param[in] sector_aligned whether the output file should be aligned to 4096-byte disk sectors
*
*/
void serialize(raft::resources const& handle,
const std::string& file_prefix,
const cuvs::neighbors::vamana::index<int8_t, uint32_t>& index,
bool include_dataset = true);
bool include_dataset = true,
bool sector_aligned = false);

/**
* Save the index to file.
@@ -514,12 +557,48 @@ void serialize(raft::resources const& handle,
* @param[in] file_prefix prefix of path and name of index files
* @param[in] index Vamana index
* @param[in] include_dataset whether or not to serialize the dataset
* @param[in] sector_aligned whether the output file should be aligned to 4096-byte disk sectors
*
*/
void serialize(raft::resources const& handle,
const std::string& file_prefix,
const cuvs::neighbors::vamana::index<uint8_t, uint32_t>& index,
bool include_dataset = true);
bool include_dataset = true,
bool sector_aligned = false);

/**
* @}
*/

/**
* @defgroup vamana_cpp_codebook Vamana codebook functions
* @{
*/

/**
* @brief Construct codebook parameters from input codebook files
*
* Expects pq pivots file at
* "${codebook_prefix}_pq_pivots.bin" and rotation matrix file at
* "${codebook_prefix}_pq_pivots.bin_rotation_matrix.bin".
*
* @code{.cpp}
* #include <cuvs/neighbors/vamana.hpp>
*
* // create a string with a filepath
* std::string codebook_prefix("/path/to/index/prefix");
* // define dimension of vectors in dataset
* int dim = 64;
* // construct codebook parameters from input codebook files
* auto codebooks = cuvs::neighbors::vamana::deserialize_codebooks(codebook_prefix, dim);
* @endcode
*
* @param[in] codebook_prefix path prefix to pq pivots and rotation matrix files
* @param[in] dim dimension of vectors in dataset
*
*/
auto deserialize_codebooks(const std::string& codebook_prefix, const int dim)
-> index_params::codebook_params<float>;

/**
* @}
137 changes: 136 additions & 1 deletion cpp/src/neighbors/detail/vamana/vamana_build.cuh
@@ -17,6 +17,7 @@
#pragma once

#include "../../../sparse/neighbors/cross_component_nn.cuh"
#include "../../detail/vpq_dataset.cuh"
#include "greedy_search.cuh"
#include "robust_prune.cuh"
#include "vamana_structs.cuh"
@@ -32,8 +33,10 @@
#include <raft/core/host_mdspan.hpp>
#include <raft/core/logger.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/linalg/gemm.hpp>
#include <raft/matrix/copy.cuh>
#include <raft/matrix/init.cuh>
#include <raft/matrix/slice.cuh>
#include <raft/random/make_blobs.cuh>

#include <thrust/device_vector.h>
@@ -376,6 +379,21 @@ void batched_insert_vamana(
RAFT_CHECK_CUDA(stream);
}

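// Encode every row of `residuals` with the provided per-subspace PQ codebook via
// process_and_fill_codes_subspaces(); the single-row VQ codebook is created only to satisfy
// that interface. The result holds one row of packed codes (including a small header) per
// input vector.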
template <typename T>
auto quantize_all_vectors(raft::resources const& res,
raft::device_matrix_view<const T, int64_t> residuals,
raft::device_matrix_view<float, uint32_t, raft::row_major> pq_codebook,
cuvs::neighbors::vpq_params ps)
-> raft::device_matrix<uint8_t, int64_t, raft::row_major>
{
auto dim = residuals.extent(1);
auto vq_codebook = raft::make_device_matrix<float, uint32_t, raft::row_major>(res, 1, dim);

auto codes = cuvs::neighbors::detail::process_and_fill_codes_subspaces<float, int64_t>(
res, ps, residuals, raft::make_const_mdspan(vq_codebook.view()), pq_codebook);
return codes;
}

template <typename T,
typename IdxT = uint64_t,
typename Accessor = raft::host_device_accessor<std::experimental::default_accessor<T>,
@@ -408,9 +426,126 @@ index<T, IdxT> build(
batched_insert_vamana<T, float, IdxT, Accessor>(
res, params, dataset, vamana_graph.view(), &medoid_id, metric);

std::optional<raft::device_matrix<uint8_t, int64_t, raft::row_major>> quantized_vectors;
if (params.codebooks) {
// Full codebook should be a raft::matrix of dimension [2^PQ_BITS * PQ_DIM, VEC_DIM / PQ_DIM]
// Every row is (VEC_DIM/PQ_DIM) floats representing a group of cluster centroids.
// Every consecutive [PQ_DIM] rows is a set.

// short-hand
auto& codebook_params = params.codebooks.value();
int pq_codebook_size = codebook_params.pq_codebook_size;
int pq_dim = codebook_params.pq_dim;

cuvs::neighbors::vpq_params pq_params;
pq_params.pq_bits = raft::log2(pq_codebook_size);
pq_params.pq_dim = pq_dim;

// transform pq_encoding_table (dimensions: pq_codebook_size x dim_per_subspace * pq_dim ) to
// pq_codebook (dimensions: pq_codebook_size * pq_dim, dim_per_subspace)
auto pq_encoding_table_device_vec = raft::make_device_vector<float, uint32_t>(
res,
codebook_params.pq_encoding_table.size()); // logically a 2D matrix with dimensions
// pq_codebook_size x dim_per_subspace * pq_dim
raft::copy(pq_encoding_table_device_vec.data_handle(),
codebook_params.pq_encoding_table.data(),
codebook_params.pq_encoding_table.size(),
raft::resource::get_cuda_stream(res));
int dim_per_subspace = dim / pq_dim;
auto pq_codebook =
raft::make_device_matrix<float, uint32_t>(res, pq_codebook_size * pq_dim, dim_per_subspace);
auto pq_encoding_table_device_vec_view = pq_encoding_table_device_vec.view();
raft::linalg::map_offset(
res,
pq_codebook.view(),
[pq_encoding_table_device_vec_view,
pq_dim,
pq_codebook_size,
dim_per_subspace,
dim] __device__(size_t i) {
int row_idx = i / dim_per_subspace;
int subspace_id = row_idx / pq_codebook_size; // idx_pq_dim
int codebook_id = row_idx % pq_codebook_size; // idx_pq_codebook_size
int id_in_subspace = i % dim_per_subspace; // idx_dim_per_subspace

return pq_encoding_table_device_vec_view[codebook_id * pq_dim * dim_per_subspace +
subspace_id * dim_per_subspace + id_in_subspace];
});

// prepare rotation matrix
auto rotation_matrix_device = raft::make_device_matrix<float, int64_t>(res, dim, dim);
raft::copy(rotation_matrix_device.data_handle(),
codebook_params.rotation_matrix.data(),
codebook_params.rotation_matrix.size(),
raft::resource::get_cuda_stream(res));

// process in batches
const uint32_t n_rows = dataset.extent(0);
// codes_rowlen defined as in cuvs::neighbors::detail::process_and_fill_codes_subspaces()
const int64_t codes_rowlen =
sizeof(uint32_t) *
(1 + raft::div_rounding_up_safe<int64_t>(pq_dim * pq_params.pq_bits, 8 * sizeof(uint32_t)));
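// Worked example (assuming pq_dim = 64 and pq_bits = 8):
// codes_rowlen = 4 * (1 + ceil(64 * 8 / 32)) = 4 * 17 = 68 bytes per row; the 4-byte
// VQ header is sliced off below, leaving 64 bytes of PQ codes per vector.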
quantized_vectors = raft::make_device_matrix<uint8_t, int64_t, raft::row_major>(
res,
n_rows,
codes_rowlen - 4); // first 4 columns of output from quantize_all_vectors() to be discarded
// TODO: with scaling workspace we could choose the batch size dynamically
constexpr uint32_t kReasonableMaxBatchSize = 65536;
const uint32_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize);
for (const auto& batch : cuvs::spatial::knn::detail::utils::batch_load_iterator<T>(
dataset.data_handle(),
n_rows,
dim,
max_batch_size,
raft::resource::get_cuda_stream(res),
raft::resource::get_workspace_resource(res))) {
// perform rotation
auto dataset_rotated = raft::make_device_matrix<float, int64_t>(res, batch.size(), dim);
if constexpr (std::is_same_v<T, float>) {
auto dataset_view = raft::make_device_matrix_view(const_cast<T*>(batch.data()),
static_cast<int64_t>(batch.size()),
static_cast<int64_t>(dim));
raft::linalg::gemm(
res, dataset_view, rotation_matrix_device.view(), dataset_rotated.view());
} else {
// convert dataset to float
auto dataset_float = raft::make_device_matrix<float, int64_t>(res, batch.size(), dim);
auto dataset_view = raft::make_device_matrix_view(
batch.data(), static_cast<int64_t>(batch.size()), static_cast<int64_t>(dim));
raft::linalg::map_offset(
res, dataset_float.view(), [dataset_view, dim] __device__(size_t i) {
int row_idx = i / dim;
int col_idx = i % dim;
return static_cast<float>(dataset_view(row_idx, col_idx));
});
raft::linalg::gemm(
res, dataset_float.view(), rotation_matrix_device.view(), dataset_rotated.view());
}

// quantize rotated vectors using codebook
auto temp_vectors =
quantize_all_vectors<float>(res, dataset_rotated.view(), pq_codebook.view(), pq_params);

// Remove the vector quantization header values
raft::matrix::slice_coordinates<int64_t> slice_coords(
0, 4, temp_vectors.extent(0), temp_vectors.extent(1));

raft::matrix::slice(res,
raft::make_const_mdspan(temp_vectors.view()),
raft::make_device_matrix_view<uint8_t, int64_t>(
quantized_vectors.value().data_handle() +
batch.offset() * quantized_vectors.value().extent(1),
batch.size(),
quantized_vectors.value().extent(1)),
slice_coords);
}
}

try {
return index<T, IdxT>(
auto idx = index<T, IdxT>(
res, params.metric, dataset, raft::make_const_mdspan(vamana_graph.view()), medoid_id);
if (quantized_vectors) idx.update_quantized_dataset(res, std::move(quantized_vectors.value()));
return idx;
} catch (std::bad_alloc& e) {
RAFT_LOG_DEBUG("Insufficient GPU memory to construct VAMANA index with dataset on GPU");
// We just add the graph. User is expected to update dataset separately (e.g allocating in