
Merge pull request #64 from johnlees/read_blocks
Use a buffer to hold read data on the GPU
johnlees authored Jul 22, 2021
2 parents 52664e1 + c10e24f commit 1d31abf
Showing 22 changed files with 816 additions and 882 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
@@ -76,7 +76,11 @@ if(CMAKE_CUDA_COMPILER)
  set(CMAKE_CUDA_FLAGS "${CUDA_OPTS}")

  add_compile_definitions(GPU_AVAILABLE)
-  add_library("${TARGET_NAME}_CUDA" OBJECT src/gpu/dist.cu src/gpu/sketch.cu)
+  add_library("${TARGET_NAME}_CUDA" OBJECT src/gpu/dist.cu
+                                           src/gpu/sketch.cu
+                                           src/gpu/device_memory.cu
+                                           src/gpu/gpu_countmin.cu
+                                           src/gpu/device_reads.cu)
  target_include_directories("${TARGET_NAME}_CUDA" PRIVATE "${EIGEN3_INCLUDE_DIR}" "${pybind11_INCLUDE_DIRS}")
  set_property(TARGET "${TARGET_NAME}_CUDA"
               PROPERTY POSITION_INDEPENDENT_CODE ON
2 changes: 1 addition & 1 deletion pp_sketch/__init__.py
@@ -3,4 +3,4 @@

'''PopPUNK sketching functions'''

-__version__ = '1.7.3'
+__version__ = '1.7.4'
11 changes: 10 additions & 1 deletion src/Makefile
@@ -45,7 +45,7 @@ PROGRAMS=sketch_test matrix_test read_test gpu_dist_test

SKETCH_OBJS=dist/dist.o dist/matrix_ops.o reference.o sketch/seqio.o sketch/sketch.o database/database.o sketch/countmin.o api.o dist/linear_regression.o random/rng.o random/random_match.o
GPU_SKETCH_OBJS=gpu/gpu_api.o
-CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o
+CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o gpu/device_reads.cu.o gpu/gpu_countmin.cu.o gpu/device_memory.cu.o

# web specific options
web: CXX = emcc
@@ -121,4 +121,13 @@ gpu/dist.cu.o:
gpu/sketch.cu.o:
	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/sketch.cu -o $@

+gpu/device_memory.cu.o:
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_memory.cu -o $@
+
+gpu/device_reads.cu.o:
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_reads.cu -o $@
+
+gpu/gpu_countmin.cu.o:
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/gpu_countmin.cu -o $@
+
.PHONY: all clean install python install_python web
2 changes: 1 addition & 1 deletion src/api.cpp
@@ -117,7 +117,7 @@ std::vector<Reference> create_sketches(

  // Save sketches and check for densified sketches
  std::cerr << "Writing sketches to file" << std::endl;
-  Database sketch_db = new_db(db_name + ".h5", codon_phased);
+  Database sketch_db = new_db(db_name + ".h5", use_rc, codon_phased);
  for (auto sketch_it = sketches.begin(); sketch_it != sketches.end();
       sketch_it++) {
    sketch_db.add_sketch(*sketch_it);
7 changes: 6 additions & 1 deletion src/database/database.cpp
@@ -224,7 +224,8 @@ HighFive::File open_h5(const std::string &filename, const bool write)
}

// Create a new HDF5 file with headline attributes
-Database new_db(const std::string &filename, const bool codon_phased)
+Database new_db(const std::string &filename, const bool use_rc,
+                const bool codon_phased)
{
  // Restrict scope of HDF5 so it is closed before reopening it
  {
@@ -241,6 +242,10 @@ Database new_db(const std::string &filename, const bool codon_phased)
        sketch_group.createAttribute<bool>("codon_phased",
                                           HighFive::DataSpace::From(codon_phased));
    codon_phased_a.write(codon_phased);
+    HighFive::Attribute use_rc_a =
+        sketch_group.createAttribute<bool>("reverse_complement",
+                                           HighFive::DataSpace::From(use_rc));
+    use_rc_a.write(use_rc);
  }
  return(Database(filename, true));
}
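
Note: the new "reverse_complement" attribute lets downstream code check how a
database was sketched. A minimal sketch of reading it back with HighFive; the
group name "sketches" and the fallback default here are assumptions for
illustration, not part of this commit:

#include <highfive/H5File.hpp>
#include <string>

bool db_uses_rc(const std::string &filename) {
  HighFive::File h5(filename, HighFive::File::ReadOnly);
  HighFive::Group sketch_group = h5.getGroup("sketches"); // group name assumed
  bool use_rc = true; // assumed default for databases written before this commit
  if (sketch_group.hasAttribute("reverse_complement")) {
    sketch_group.getAttribute("reverse_complement").read(use_rc);
  }
  return use_rc;
}
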
3 changes: 2 additions & 1 deletion src/database/database.hpp
@@ -42,6 +42,7 @@ class Database
    }
    return _version_hash == db2.version();
  }
+  void flush() { _h5_file.flush(); }

private:
  HighFive::File _h5_file;
@@ -52,4 +53,4 @@
};

HighFive::File open_h5(const std::string &filename, const bool write = true);
-Database new_db(const std::string &filename, const bool codon_phased);
+Database new_db(const std::string &filename, const bool use_rc, const bool codon_phased);
2 changes: 1 addition & 1 deletion src/dist/matrix_ops.cpp
@@ -174,7 +174,7 @@ NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists,
  for (long distIdx = 0; distIdx < qrDists.rows(); distIdx++)
  {
    unsigned long i = distIdx % nrrSamples;
-    unsigned long j = static_cast<size_t>(distIdx / (float)nrrSamples + 0.001f) + nrrSamples;
+    unsigned long j = distIdx / nrrSamples + nrrSamples;
    squareDists(i, j) = qrDists[distIdx];
    squareDists(j, i) = qrDists[distIdx];
  }
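
Note: the replaced line recovered the query column with float division plus a
0.001f fudge factor; plain integer division gives the same mapping exactly,
since the flat query-ref vector is laid out with the reference index cycling
fastest. A standalone check of the indexing, with illustrative values that are
not from the diff:

#include <cassert>

int main() {
  const long nrrSamples = 3;
  // distIdx 0..2 fill query column j = 3; distIdx 3..5 fill column j = 4
  assert(0 % nrrSamples == 0 && 0 / nrrSamples + nrrSamples == 3);
  assert(5 % nrrSamples == 2 && 5 / nrrSamples + nrrSamples == 4);
  return 0;
}
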
1 change: 1 addition & 0 deletions src/gpu/cuda.cuh
@@ -26,6 +26,7 @@ static void HandleCUDAError(const char *file, int line,
}

#define CUDA_CALL(err) (HandleCUDAError(__FILE__, __LINE__, err))
+#define CUDA_CALL_NOTHROW( err ) (err)

// A bit lazy... should be a class and just use the destructor
struct progress_atomics {
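
Note: CUDA_CALL routes the status through HandleCUDAError, which can throw on
failure; the new CUDA_CALL_NOTHROW evaluates the call and discards the status.
This matters in destructors, where a throw during stack unwinding terminates
the program. An illustrative sketch of the pattern; the class is hypothetical,
but DeviceReads below uses the same idiom:

struct ScopedDeviceBuffer {
  explicit ScopedDeviceBuffer(size_t bytes) {
    CUDA_CALL(cudaMalloc((void **)&data, bytes)); // throwing is fine here
  }
  ~ScopedDeviceBuffer() {
    CUDA_CALL_NOTHROW(cudaFree(data)); // never throw from a destructor
  }
  char *data = nullptr;
};
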
83 changes: 83 additions & 0 deletions src/gpu/device_memory.cu
@@ -0,0 +1,83 @@

#include "device_memory.cuh"
#include "cuda.cuh"
#include "gpu.hpp"

// Sets up data structures and loads them onto the device
DeviceMemory::DeviceMemory(
SketchStrides &ref_strides, SketchStrides &query_strides,
std::vector<Reference> &ref_sketches,
std::vector<Reference> &query_sketches, const SketchSlice &sample_slice,
const FlatRandom &flat_random, const std::vector<uint16_t> &ref_random_idx,
const std::vector<uint16_t> &query_random_idx,
const std::vector<size_t> &kmer_lengths, long long dist_rows,
const bool self, const int cpu_threads)
: _n_dists(dist_rows * 2), d_query_sketches(nullptr),
d_query_random(nullptr) {
// Set up reference sketches, flatten and copy to device
std::vector<uint64_t> flat_ref = flatten_by_samples(
ref_sketches, kmer_lengths, ref_strides, sample_slice.ref_offset,
sample_slice.ref_offset + sample_slice.ref_size, cpu_threads);
CUDA_CALL(
cudaMalloc((void **)&d_ref_sketches, flat_ref.size() * sizeof(uint64_t)));
CUDA_CALL(cudaMemcpy(d_ref_sketches, flat_ref.data(),
flat_ref.size() * sizeof(uint64_t), cudaMemcpyDefault));

// Preload random match chances, which have already been flattened
CUDA_CALL(cudaMalloc((void **)&d_random_table,
std::get<1>(flat_random).size() * sizeof(float)));
CUDA_CALL(cudaMemcpy(d_random_table, std::get<1>(flat_random).data(),
std::get<1>(flat_random).size() * sizeof(float),
cudaMemcpyDefault));
CUDA_CALL(cudaMalloc((void **)&d_ref_random,
sample_slice.ref_size * sizeof(uint16_t)));
CUDA_CALL(
cudaMemcpy(d_ref_random, ref_random_idx.data() + sample_slice.ref_offset,
sample_slice.ref_size * sizeof(uint16_t), cudaMemcpyDefault));

// If ref v query mode, also flatten query vector and copy to device
if (!self) {
std::vector<uint64_t> flat_query = flatten_by_bins(
query_sketches, kmer_lengths, query_strides, sample_slice.query_offset,
sample_slice.query_offset + sample_slice.query_size, cpu_threads);
CUDA_CALL(cudaMalloc((void **)&d_query_sketches,
flat_query.size() * sizeof(uint64_t)));
CUDA_CALL(cudaMemcpy(d_query_sketches, flat_query.data(),
flat_query.size() * sizeof(uint64_t),
cudaMemcpyDefault));

CUDA_CALL(cudaMalloc((void **)&d_query_random,
sample_slice.query_size * sizeof(uint16_t)));
CUDA_CALL(cudaMemcpy(
d_query_random, query_random_idx.data() + sample_slice.query_offset,
sample_slice.query_size * sizeof(uint16_t), cudaMemcpyDefault));
} else {
query_strides = ref_strides;
}

// Copy or set other arrays needed on device (kmers and distance output)
std::vector<int> kmer_ints(kmer_lengths.begin(), kmer_lengths.end());
CUDA_CALL(cudaMalloc((void **)&d_kmers, kmer_ints.size() * sizeof(int)));
CUDA_CALL(cudaMemcpy(d_kmers, kmer_ints.data(),
kmer_ints.size() * sizeof(int), cudaMemcpyDefault));

CUDA_CALL(cudaMalloc((void **)&d_dist_mat, _n_dists * sizeof(float)));
CUDA_CALL(cudaMemset(d_dist_mat, 0, _n_dists * sizeof(float)));
}

DeviceMemory::~DeviceMemory() {
CUDA_CALL(cudaFree(d_ref_sketches));
CUDA_CALL(cudaFree(d_query_sketches));
CUDA_CALL(cudaFree(d_random_table));
CUDA_CALL(cudaFree(d_ref_random));
CUDA_CALL(cudaFree(d_query_random));
CUDA_CALL(cudaFree(d_kmers));
CUDA_CALL(cudaFree(d_dist_mat));
}

std::vector<float> DeviceMemory::read_dists() {
std::vector<float> dists(_n_dists);
CUDA_CALL(cudaMemcpy(dists.data(), d_dist_mat, _n_dists * sizeof(float),
cudaMemcpyDefault));
return dists;
}
81 changes: 81 additions & 0 deletions src/gpu/device_memory.cuh
@@ -0,0 +1,81 @@
#pragma once

#include "reference.hpp"

// Align structs
// https://stackoverflow.com/a/12779757
#if defined(__CUDACC__) // NVCC
#define ALIGN(n) __align__(n)
#elif defined(__GNUC__) // GCC
#define ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER) // MSVC
#define ALIGN(n) __declspec(align(n))
#else
#error "Please provide a definition for the ALIGN macro for your host compiler!"
#endif

struct ALIGN(8) RandomStrides {
  size_t kmer_stride;
  size_t cluster_inner_stride;
  size_t cluster_outer_stride;
};

typedef std::tuple<RandomStrides, std::vector<float>> FlatRandom;

#ifdef GPU_AVAILABLE

// Structure of flattened vectors
struct ALIGN(16) SketchStrides {
  size_t bin_stride;
  size_t kmer_stride;
  size_t sample_stride;
  size_t sketchsize64;
  size_t bbits;
};

struct ALIGN(8) SketchSlice {
  size_t ref_offset;
  size_t ref_size;
  size_t query_offset;
  size_t query_size;
};

// Memory on device for each operation
class DeviceMemory {
public:
  DeviceMemory(SketchStrides &ref_strides, SketchStrides &query_strides,
               std::vector<Reference> &ref_sketches,
               std::vector<Reference> &query_sketches,
               const SketchSlice &sample_slice, const FlatRandom &flat_random,
               const std::vector<uint16_t> &ref_random_idx,
               const std::vector<uint16_t> &query_random_idx,
               const std::vector<size_t> &kmer_lengths, long long dist_rows,
               const bool self, const int cpu_threads);

  ~DeviceMemory();

  std::vector<float> read_dists();

  uint64_t *ref_sketches() { return d_ref_sketches; }
  uint64_t *query_sketches() { return d_query_sketches; }
  float *random_table() { return d_random_table; }
  uint16_t *ref_random() { return d_ref_random; }
  uint16_t *query_random() { return d_query_random; }
  int *kmers() { return d_kmers; }
  float *dist_mat() { return d_dist_mat; }

private:
  DeviceMemory(const DeviceMemory &) = delete;
  DeviceMemory(DeviceMemory &&) = delete;

  size_t _n_dists;
  uint64_t *d_ref_sketches;
  uint64_t *d_query_sketches;
  float *d_random_table;
  uint16_t *d_ref_random;
  uint16_t *d_query_random;
  int *d_kmers;
  float *d_dist_mat;
};

#endif
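
Note: a sketch of the intended call pattern for DeviceMemory; every variable
name here is assumed for illustration rather than taken from this commit:

// Allocate all device arrays for one slice of the distance matrix,
// launch the (not shown) distance kernel on the raw pointers, then
// copy the results back to the host.
DeviceMemory device_arrays(ref_strides, query_strides, ref_sketches,
                           query_sketches, slice, flat_random, ref_random_idx,
                           query_random_idx, kmer_lengths, dist_rows,
                           /*self=*/true, /*cpu_threads=*/4);
// ... kernel launch using device_arrays.ref_sketches(),
//     device_arrays.random_table(), device_arrays.dist_mat() ...
std::vector<float> dists = device_arrays.read_dists(); // 2 * dist_rows values
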
58 changes: 58 additions & 0 deletions src/gpu/device_reads.cu
@@ -0,0 +1,58 @@

#include "cuda.cuh"
#include "device_reads.cuh"

DeviceReads::DeviceReads(const std::shared_ptr<SeqBuf> &seq_ptr,
const size_t n_threads)
: seq(seq_ptr), n_reads(seq->n_full_seqs()), read_length(seq->max_length()),
current_block(0), buffer_filled(0), loaded_first(false) {
// Set up buffer to load in reads (on host)
size_t mem_free = 0;
size_t mem_total = 0;
CUDA_CALL(cudaMemGetInfo(&mem_free, &mem_total));
buffer_size = (mem_free * 0.9) / (read_length * sizeof(char));
buffer_blocks =
std::floor(n_reads / (static_cast<double>(buffer_size) + 1)) + 1;
if (buffer_size > n_reads) {
buffer_size = n_reads;
buffer_blocks = 1;
}
host_buffer.resize(buffer_size * read_length);
CUDA_CALL_NOTHROW(cudaHostRegister(host_buffer.data(),
host_buffer.size() * sizeof(char),
cudaHostRegisterDefault));

// Buffer to store reads (on device)
CUDA_CALL(
cudaMalloc((void **)&d_reads, buffer_size * read_length * sizeof(char)));
}

DeviceReads::~DeviceReads() {
CUDA_CALL_NOTHROW(cudaHostUnregister(host_buffer.data()));
CUDA_CALL_NOTHROW(cudaFree(d_reads));
}

bool DeviceReads::next_buffer() {
bool success;
if (current_block < buffer_blocks) {
if (buffer_blocks > 1 || !loaded_first) {
size_t start = current_block * buffer_size;
size_t end = (current_block + 1) * buffer_size;
if (end > seq->n_full_seqs()) {
end = seq->n_full_seqs();
}
buffer_filled = end - start;

seq->load_seqs(host_buffer, start, end);
CUDA_CALL(cudaMemcpyAsync(d_reads, host_buffer.data(),
buffer_filled * read_length * sizeof(char),
cudaMemcpyDefault));
loaded_first = true;
}
current_block++;
success = true;
} else {
success = false;
}
return success;
}
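
Note: a quick numeric check of the block arithmetic in the constructor, with
illustrative values only; the +1 inside the floor and the trailing +1 together
ensure the final partial block is counted:

#include <cassert>
#include <cmath>

int main() {
  const size_t n_reads = 10, buffer_size = 4; // illustrative values
  size_t buffer_blocks =
      std::floor(n_reads / (static_cast<double>(buffer_size) + 1)) + 1;
  assert(buffer_blocks == 3); // blocks cover [0,4), [4,8), [8,10)
  return 0;
}
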
36 changes: 36 additions & 0 deletions src/gpu/device_reads.cuh
@@ -0,0 +1,36 @@
#pragma once

#include <memory> // shared_ptr
#include <vector>

#include "sketch/seqio.hpp" // SeqBuf

class DeviceReads {
public:
  DeviceReads(const std::shared_ptr<SeqBuf> &seq_in, const size_t n_threads);
  ~DeviceReads();

  bool next_buffer();

  void reset_buffer() { current_block = 0; }
  char *read_ptr() { return d_reads; }
  size_t buffer_count() const { return buffer_filled; }
  size_t length() const { return read_length; }

private:
  // delete copy and move to avoid accidentally using them
  DeviceReads(const DeviceReads &) = delete;
  DeviceReads(DeviceReads &&other) = delete;

  char *d_reads;
  std::vector<char> host_buffer;
  std::shared_ptr<SeqBuf> seq;

  size_t n_reads;
  size_t read_length;
  size_t buffer_size;
  size_t buffer_blocks;
  size_t current_block;
  size_t buffer_filled;
  bool loaded_first;
};
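
Note: a sketch of the assumed driver loop (not shown in this commit) that
streams every block of reads through the single device buffer; the function
name and thread count are placeholders:

void sketch_reads(const std::shared_ptr<SeqBuf> &seq_buf) {
  DeviceReads reads(seq_buf, 4); // 4 CPU threads (illustrative)
  while (reads.next_buffer()) {
    // launch a counting kernel over reads.read_ptr();
    // this block holds reads.buffer_count() reads of reads.length() bases
  }
  reads.reset_buffer(); // rewind so the blocks can be streamed again
}
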