Skip to content

Deadlock when using int64 keys converted from int32 #147

@rhdong

Description

@rhdong
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <new>
#include <string>

#include "random"
#include "hierarchical_kv/merlin_hashtable.cuh"

// Element types used throughout this reproducer:
//   K - element type of every key buffer (h_keys, d_keys, d_input_keys, ...).
//   S - element type of the evicted-scores buffer.
//   V - element type of every value buffer.
using K = uint64_t;
using S = uint64_t;
using V = float;
// The hash table instantiation under test and its options struct.
using MerlinHashTable = nv::merlin::HashTable<K, V, S>;
using TableOption = nv::merlin::HashTableOptions;

// The code below refers to EvictStrategy, cout and endl without qualification,
// so both namespaces are pulled into the global scope.
using namespace nv::merlin;
using namespace std;

// Turns a macro argument such as __LINE__ into a string literal (two levels so
// the argument is expanded before being stringified). Guarded so that a
// definition already supplied by another header is reused.
#ifndef CUCO_STRINGIFY
#define CUCO_STRINGIFY_DETAIL(x) #x
#define CUCO_STRINGIFY(x) CUCO_STRINGIFY_DETAIL(x)
#endif

// CUCO_CUDA_TRY(call)                  -> throws cuco::cuda_error on failure
// CUCO_CUDA_TRY(call, exception_type)  -> throws exception_type on failure
//
// Evaluates `call` exactly once. If the returned cudaError_t is not
// cudaSuccess, cudaGetLastError() is called (which reads and resets the
// runtime's last-error state) and an exception is thrown whose message has the
// form "ERROR at <file>:<line>: <error name> <error string>".
//
// The macro expands to a single `do { ... } while (0)` statement WITHOUT a
// trailing semicolon, so `CUCO_CUDA_TRY(x);` is exactly one statement and is
// safe inside an unbraced if/else.
//
// NOTE(review): cuco::cuda_error is not declared anywhere in this file; it is
// presumably provided by a cuCollections header - confirm it is reachable, or
// always use the two-argument form with an exception type that is in scope.
#define CUCO_CUDA_TRY(...)                                               \
  GET_CUCO_CUDA_TRY_MACRO(__VA_ARGS__, CUCO_CUDA_TRY_2, CUCO_CUDA_TRY_1) \
  (__VA_ARGS__)
// Selects CUCO_CUDA_TRY_1 or CUCO_CUDA_TRY_2 based on the argument count.
#define GET_CUCO_CUDA_TRY_MACRO(_1, _2, NAME, ...) NAME
#define CUCO_CUDA_TRY_2(_call, _exception_type)                         \
  do {                                                                  \
    cudaError_t const error = (_call);                                  \
    if (cudaSuccess != error) {                                         \
      cudaGetLastError();                                               \
      throw _exception_type{std::string{"ERROR at "} + __FILE__ + ":" + \
                            CUCO_STRINGIFY(__LINE__) + ": " +           \
                            cudaGetErrorName(error) + " " +             \
                            cudaGetErrorString(error)};                 \
    }                                                                   \
  } while (0)
#define CUCO_CUDA_TRY_1(_call) CUCO_CUDA_TRY_2(_call, cuco::cuda_error)


// Placeholder device function: takes nothing, does nothing, returns nothing.
__device__ void select() {}


/**
 * Endless find / insert_and_evict stress loop against a MerlinHashTable.
 *
 * Setup:
 *  - Creates a table with init_capacity == max_capacity == 128 * 1024 * 1024,
 *    dim 8 and the kLru evict strategy.
 *  - Generates 2^27 "hot" keys drawn from [0, 2^27) and 2^27 "cold" keys drawn
 *    from [0, 2^50], plus random float values, and uploads them to the device.
 *
 * Each iteration then picks a random window of 2^20 hot keys, overwrites the
 * first tenth of the batch with cold keys from the same offset, and issues
 * find(); it repeats the same batch construction with a new offset and issues
 * insert_and_evict(). The stream is synchronized and the iteration count is
 * printed.
 *
 * The loop never terminates by design, so no buffer is released on the normal
 * path. Every CUDA runtime call is checked with CUCO_CUDA_TRY so that an
 * asynchronous fault is reported as an exception rather than being ignored.
 *
 * @param block Unused; kept so existing callers continue to compile.
 * @throws std::bad_alloc if a host staging buffer cannot be allocated.
 * @throws the exception type thrown by CUCO_CUDA_TRY if a CUDA runtime call
 *         fails.
 */
void test_merlin(bool block = true) {
  (void)block;  // not used by the body

  const size_t total_key_length = size_t{1} << 27;
  const size_t total_key_range = size_t{1} << 27;
  const size_t total_key_per_op = size_t{1} << 20;
  // Number of cold keys spliced into the front of every batch.
  const size_t cold_key_per_op = total_key_per_op / 10;
  // Single source of truth for the value width; also fed to options.dim.
  const int dim_size = 8;

  TableOption options;
  options.init_capacity = 128 * 1024 * 1024UL;
  options.max_capacity = 128 * 1024 * 1024UL;
  options.dim = dim_size;
  options.max_hbm_for_vectors = nv::merlin::GB(30);
  options.evict_strategy = nv::merlin::EvictStrategy::kLru;

  std::shared_ptr<MerlinHashTable> table = std::make_shared<MerlinHashTable>();
  table->init(options);

  const size_t key_bytes = total_key_length * sizeof(K);
  const size_t value_bytes = total_key_length * dim_size * sizeof(V);
  const size_t op_key_bytes = total_key_per_op * sizeof(K);
  const size_t op_value_bytes = total_key_per_op * dim_size * sizeof(V);
  const size_t op_score_bytes = total_key_per_op * sizeof(S);
  const size_t op_found_bytes = total_key_per_op * sizeof(bool);

  // Host staging buffers. These are multi-gigabyte requests, so a failed
  // allocation must be reported instead of being dereferenced below.
  K* h_keys = static_cast<K*>(std::malloc(key_bytes));
  V* h_values = static_cast<V*>(std::malloc(value_bytes));
  K* h_cold_keys = static_cast<K*>(std::malloc(key_bytes));
  if (h_keys == nullptr || h_values == nullptr || h_cold_keys == nullptr) {
    std::free(h_keys);
    std::free(h_values);
    std::free(h_cold_keys);
    throw std::bad_alloc();
  }

  // Full key / value pools on the device.
  K* d_keys = nullptr;
  V* d_values = nullptr;
  K* d_cold_keys = nullptr;
  CUCO_CUDA_TRY(cudaMalloc(&d_keys, key_bytes));
  CUCO_CUDA_TRY(cudaMalloc(&d_values, value_bytes));
  CUCO_CUDA_TRY(cudaMalloc(&d_cold_keys, key_bytes));

  // Per-operation input batch and find() outputs.
  K* d_input_keys = nullptr;
  V* d_output_values = nullptr;
  bool* d_founds = nullptr;
  CUCO_CUDA_TRY(cudaMalloc(&d_input_keys, op_key_bytes));
  CUCO_CUDA_TRY(cudaMalloc(&d_output_values, op_value_bytes));
  CUCO_CUDA_TRY(cudaMalloc(&d_founds, op_found_bytes));
  CUCO_CUDA_TRY(cudaMemset(d_output_values, 0, op_value_bytes));
  CUCO_CUDA_TRY(cudaMemset(d_founds, 0, op_found_bytes));

  // Per-operation insert_and_evict() outputs.
  K* d_evicted_keys = nullptr;
  V* d_evicted_values = nullptr;
  S* d_evicted_scores = nullptr;
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_keys, op_key_bytes));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_values, op_value_bytes));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_scores, op_score_bytes));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_keys, 0, op_key_bytes));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_values, 0, op_value_bytes));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_scores, 0, op_score_bytes));

  std::random_device dev;
  std::mt19937 mt(dev());
  // Explicit 64-bit shift: `1ul << 50` is undefined where unsigned long is
  // only 32 bits wide.
  std::uniform_int_distribution<uint64_t> dist(0, uint64_t{1} << 50);
  // size_t index: matches total_key_length and keeps i * dim_size away from
  // the int range limit.
  for (size_t i = 0; i < total_key_length; i++) {
    h_keys[i] = dist(mt) % total_key_range;  // hot key, confined to 27 bits
    h_cold_keys[i] = dist(mt);               // cold key, up to 2^50
    for (int j = 0; j < dim_size; j++) {
      h_values[i * dim_size + j] =
          static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }
  }

  CUCO_CUDA_TRY(cudaMemcpy(d_keys, h_keys, key_bytes, cudaMemcpyHostToDevice));
  CUCO_CUDA_TRY(
      cudaMemcpy(d_cold_keys, h_cold_keys, key_bytes, cudaMemcpyHostToDevice));
  CUCO_CUDA_TRY(
      cudaMemcpy(d_values, h_values, value_bytes, cudaMemcpyHostToDevice));

  cudaStream_t stream;
  CUCO_CUDA_TRY(cudaStreamCreate(&stream));

  std::cout << "start" << std::endl;
  uint64_t count = 0;  // 64-bit: the loop is unbounded
  for (;;) {
    // ---- find: batch of hot keys with the leading tenth replaced by cold ----
    size_t start_index = dist(mt) % (total_key_length - total_key_per_op);
    CUCO_CUDA_TRY(cudaMemcpyAsync(d_input_keys, d_keys + start_index,
                                  op_key_bytes, cudaMemcpyDeviceToDevice,
                                  stream));
    CUCO_CUDA_TRY(cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index,
                                  cold_key_per_op * sizeof(K),
                                  cudaMemcpyDeviceToDevice, stream));
    // NOTE(review): the fifth argument is left as nullptr exactly as in the
    // original report - confirm its meaning against the HashTable::find API.
    table->find(total_key_per_op, d_input_keys, d_output_values, d_founds,
                nullptr, stream);

    // ---- insert_and_evict: same batch shape at a fresh random offset ----
    start_index = dist(mt) % (total_key_length - total_key_per_op);
    CUCO_CUDA_TRY(cudaMemcpyAsync(d_input_keys, d_keys + start_index,
                                  op_key_bytes, cudaMemcpyDeviceToDevice,
                                  stream));
    CUCO_CUDA_TRY(cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index,
                                  cold_key_per_op * sizeof(K),
                                  cudaMemcpyDeviceToDevice, stream));
    // NOTE(review): the fourth argument is left as nullptr exactly as in the
    // original report - confirm against the HashTable::insert_and_evict API.
    table->insert_and_evict(total_key_per_op, d_input_keys,
                            d_values + start_index * dim_size, nullptr,
                            d_evicted_keys, d_evicted_values, d_evicted_scores,
                            stream);

    // Checked sync: a device-side fault surfaces here as an exception, which
    // distinguishes it from a genuine hang.
    CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
    count++;
    std::cout << "find and insert_and_evict: " << count << std::endl;
  }
}

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions