
Merge pull request #64 from johnlees/read_blocks
Use a buffer to hold read data on the GPU
johnlees authored Jul 22, 2021
2 parents 52664e1 + c10e24f commit 1d31abf
Showing 22 changed files with 816 additions and 882 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
@@ -76,7 +76,11 @@ if(CMAKE_CUDA_COMPILER)
  set(CMAKE_CUDA_FLAGS "${CUDA_OPTS}")

  add_compile_definitions(GPU_AVAILABLE)
-  add_library("${TARGET_NAME}_CUDA" OBJECT src/gpu/dist.cu src/gpu/sketch.cu)
+  add_library("${TARGET_NAME}_CUDA" OBJECT src/gpu/dist.cu
+                                           src/gpu/sketch.cu
+                                           src/gpu/device_memory.cu
+                                           src/gpu/gpu_countmin.cu
+                                           src/gpu/device_reads.cu)
  target_include_directories("${TARGET_NAME}_CUDA" PRIVATE "${EIGEN3_INCLUDE_DIR}" "${pybind11_INCLUDE_DIRS}")
  set_property(TARGET "${TARGET_NAME}_CUDA"
               PROPERTY POSITION_INDEPENDENT_CODE ON
2 changes: 1 addition & 1 deletion pp_sketch/__init__.py
@@ -3,4 +3,4 @@

'''PopPUNK sketching functions'''

-__version__ = '1.7.3'
+__version__ = '1.7.4'
11 changes: 10 additions & 1 deletion src/Makefile
@@ -45,7 +45,7 @@ PROGRAMS=sketch_test matrix_test read_test gpu_dist_test

SKETCH_OBJS=dist/dist.o dist/matrix_ops.o reference.o sketch/seqio.o sketch/sketch.o database/database.o sketch/countmin.o api.o dist/linear_regression.o random/rng.o random/random_match.o
GPU_SKETCH_OBJS=gpu/gpu_api.o
-CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o
+CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o gpu/device_reads.cu.o gpu/gpu_countmin.cu.o gpu/device_memory.cu.o

# web specific options
web: CXX = emcc
@@ -121,4 +121,13 @@ gpu/dist.cu.o:
gpu/sketch.cu.o:
	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/sketch.cu -o $@

+gpu/device_memory.cu.o:
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_memory.cu -o $@
+
+gpu/device_reads.cu.o:
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_reads.cu -o $@
+
+gpu/gpu_countmin.cu.o:
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/gpu_countmin.cu -o $@
+
.PHONY: all clean install python install_python web
2 changes: 1 addition & 1 deletion src/api.cpp
@@ -117,7 +117,7 @@ std::vector<Reference> create_sketches(

  // Save sketches and check for densified sketches
  std::cerr << "Writing sketches to file" << std::endl;
-  Database sketch_db = new_db(db_name + ".h5", codon_phased);
+  Database sketch_db = new_db(db_name + ".h5", use_rc, codon_phased);
  for (auto sketch_it = sketches.begin(); sketch_it != sketches.end();
       sketch_it++) {
    sketch_db.add_sketch(*sketch_it);
7 changes: 6 additions & 1 deletion src/database/database.cpp
@@ -224,7 +224,8 @@ HighFive::File open_h5(const std::string &filename, const bool write)
}

// Create a new HDF5 file with headline attributes
-Database new_db(const std::string &filename, const bool codon_phased)
+Database new_db(const std::string &filename, const bool use_rc,
+                const bool codon_phased)
{
  // Restrict scope of HDF5 so it is closed before reopening it
  {
@@ -241,6 +242,10 @@ Database new_db(const std::string &filename, const bool codon_phased)
        sketch_group.createAttribute<bool>("codon_phased",
                                           HighFive::DataSpace::From(codon_phased));
    codon_phased_a.write(codon_phased);
+    HighFive::Attribute use_rc_a =
+        sketch_group.createAttribute<bool>("reverse_complement",
+                                           HighFive::DataSpace::From(use_rc));
+    use_rc_a.write(use_rc);
  }
  return(Database(filename, true));
}
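
Note: the new "reverse_complement" attribute lets downstream code check how a
database was sketched. A minimal sketch of reading it back with HighFive; the
group name "sketches" and the fallback default here are assumptions for
illustration, not part of this commit:

#include <highfive/H5File.hpp>
#include <string>

bool db_uses_rc(const std::string &filename) {
  HighFive::File h5(filename, HighFive::File::ReadOnly);
  HighFive::Group sketch_group = h5.getGroup("sketches"); // group name assumed
  bool use_rc = true; // assumed default for databases written before this commit
  if (sketch_group.hasAttribute("reverse_complement")) {
    sketch_group.getAttribute("reverse_complement").read(use_rc);
  }
  return use_rc;
}
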
3 changes: 2 additions & 1 deletion src/database/database.hpp
@@ -42,6 +42,7 @@ class Database
    }
    return _version_hash == db2.version();
  }
+  void flush() { _h5_file.flush(); }

private:
  HighFive::File _h5_file;
@@ -52,4 +53,4 @@
};

HighFive::File open_h5(const std::string &filename, const bool write = true);
-Database new_db(const std::string &filename, const bool codon_phased);
+Database new_db(const std::string &filename, const bool use_rc, const bool codon_phased);
2 changes: 1 addition & 1 deletion src/dist/matrix_ops.cpp
@@ -174,7 +174,7 @@ NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists,
  for (long distIdx = 0; distIdx < qrDists.rows(); distIdx++)
  {
    unsigned long i = distIdx % nrrSamples;
-    unsigned long j = static_cast<size_t>(distIdx / (float)nrrSamples + 0.001f) + nrrSamples;
+    unsigned long j = distIdx / nrrSamples + nrrSamples;
    squareDists(i, j) = qrDists[distIdx];
    squareDists(j, i) = qrDists[distIdx];
  }
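
Note: the replaced line recovered the query column with float division plus a
0.001f fudge factor; plain integer division gives the same mapping exactly,
since the flat query-ref vector is laid out with the reference index cycling
fastest. A standalone check of the indexing, with illustrative values that are
not from the diff:

#include <cassert>

int main() {
  const long nrrSamples = 3;
  // distIdx 0..2 fill query column j = 3; distIdx 3..5 fill column j = 4
  assert(0 % nrrSamples == 0 && 0 / nrrSamples + nrrSamples == 3);
  assert(5 % nrrSamples == 2 && 5 / nrrSamples + nrrSamples == 4);
  return 0;
}
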
1 change: 1 addition & 0 deletions src/gpu/cuda.cuh
@@ -26,6 +26,7 @@ static void HandleCUDAError(const char *file, int line,
}

#define CUDA_CALL(err) (HandleCUDAError(__FILE__, __LINE__, err))
+#define CUDA_CALL_NOTHROW( err ) (err)

// A bit lazy... should be a class and just use the destructor
struct progress_atomics {
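
Note: CUDA_CALL routes the status through HandleCUDAError, which can throw on
failure; the new CUDA_CALL_NOTHROW evaluates the call and discards the status.
This matters in destructors, where a throw during stack unwinding terminates
the program. An illustrative sketch of the pattern; the class is hypothetical,
but DeviceReads below uses the same idiom:

struct ScopedDeviceBuffer {
  explicit ScopedDeviceBuffer(size_t bytes) {
    CUDA_CALL(cudaMalloc((void **)&data, bytes)); // throwing is fine here
  }
  ~ScopedDeviceBuffer() {
    CUDA_CALL_NOTHROW(cudaFree(data)); // never throw from a destructor
  }
  char *data = nullptr;
};
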
83 changes: 83 additions & 0 deletions src/gpu/device_memory.cu
@@ -0,0 +1,83 @@

#include "device_memory.cuh"
#include "cuda.cuh"
#include "gpu.hpp"

// Sets up data structures and loads them onto the device
DeviceMemory::DeviceMemory(
SketchStrides &ref_strides, SketchStrides &query_strides,
std::vector<Reference> &ref_sketches,
std::vector<Reference> &query_sketches, const SketchSlice &sample_slice,
const FlatRandom &flat_random, const std::vector<uint16_t> &ref_random_idx,
const std::vector<uint16_t> &query_random_idx,
const std::vector<size_t> &kmer_lengths, long long dist_rows,
const bool self, const int cpu_threads)
: _n_dists(dist_rows * 2), d_query_sketches(nullptr),
d_query_random(nullptr) {
// Set up reference sketches, flatten and copy to device
std::vector<uint64_t> flat_ref = flatten_by_samples(
ref_sketches, kmer_lengths, ref_strides, sample_slice.ref_offset,
sample_slice.ref_offset + sample_slice.ref_size, cpu_threads);
CUDA_CALL(
cudaMalloc((void **)&d_ref_sketches, flat_ref.size() * sizeof(uint64_t)));
CUDA_CALL(cudaMemcpy(d_ref_sketches, flat_ref.data(),
flat_ref.size() * sizeof(uint64_t), cudaMemcpyDefault));

// Preload random match chances, which have already been flattened
CUDA_CALL(cudaMalloc((void **)&d_random_table,
std::get<1>(flat_random).size() * sizeof(float)));
CUDA_CALL(cudaMemcpy(d_random_table, std::get<1>(flat_random).data(),
std::get<1>(flat_random).size() * sizeof(float),
cudaMemcpyDefault));
CUDA_CALL(cudaMalloc((void **)&d_ref_random,
sample_slice.ref_size * sizeof(uint16_t)));
CUDA_CALL(
cudaMemcpy(d_ref_random, ref_random_idx.data() + sample_slice.ref_offset,
sample_slice.ref_size * sizeof(uint16_t), cudaMemcpyDefault));

// If ref v query mode, also flatten query vector and copy to device
if (!self) {
std::vector<uint64_t> flat_query = flatten_by_bins(
query_sketches, kmer_lengths, query_strides, sample_slice.query_offset,
sample_slice.query_offset + sample_slice.query_size, cpu_threads);
CUDA_CALL(cudaMalloc((void **)&d_query_sketches,
flat_query.size() * sizeof(uint64_t)));
CUDA_CALL(cudaMemcpy(d_query_sketches, flat_query.data(),
flat_query.size() * sizeof(uint64_t),
cudaMemcpyDefault));

CUDA_CALL(cudaMalloc((void **)&d_query_random,
sample_slice.query_size * sizeof(uint16_t)));
CUDA_CALL(cudaMemcpy(
d_query_random, query_random_idx.data() + sample_slice.query_offset,
sample_slice.query_size * sizeof(uint16_t), cudaMemcpyDefault));
} else {
query_strides = ref_strides;
}

// Copy or set other arrays needed on device (kmers and distance output)
std::vector<int> kmer_ints(kmer_lengths.begin(), kmer_lengths.end());
CUDA_CALL(cudaMalloc((void **)&d_kmers, kmer_ints.size() * sizeof(int)));
CUDA_CALL(cudaMemcpy(d_kmers, kmer_ints.data(),
kmer_ints.size() * sizeof(int), cudaMemcpyDefault));

CUDA_CALL(cudaMalloc((void **)&d_dist_mat, _n_dists * sizeof(float)));
CUDA_CALL(cudaMemset(d_dist_mat, 0, _n_dists * sizeof(float)));
}

DeviceMemory::~DeviceMemory() {
CUDA_CALL(cudaFree(d_ref_sketches));
CUDA_CALL(cudaFree(d_query_sketches));
CUDA_CALL(cudaFree(d_random_table));
CUDA_CALL(cudaFree(d_ref_random));
CUDA_CALL(cudaFree(d_query_random));
CUDA_CALL(cudaFree(d_kmers));
CUDA_CALL(cudaFree(d_dist_mat));
}

std::vector<float> DeviceMemory::read_dists() {
std::vector<float> dists(_n_dists);
CUDA_CALL(cudaMemcpy(dists.data(), d_dist_mat, _n_dists * sizeof(float),
cudaMemcpyDefault));
return dists;
}
81 changes: 81 additions & 0 deletions src/gpu/device_memory.cuh
@@ -0,0 +1,81 @@
#pragma once

#include "reference.hpp"

// Align structs
// https://stackoverflow.com/a/12779757
#if defined(__CUDACC__) // NVCC
#define ALIGN(n) __align__(n)
#elif defined(__GNUC__) // GCC
#define ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER) // MSVC
#define ALIGN(n) __declspec(align(n))
#else
#error "Please provide a definition for the ALIGN macro for your host compiler!"
#endif

struct ALIGN(8) RandomStrides {
  size_t kmer_stride;
  size_t cluster_inner_stride;
  size_t cluster_outer_stride;
};

typedef std::tuple<RandomStrides, std::vector<float>> FlatRandom;

#ifdef GPU_AVAILABLE

// Structure of flattened vectors
struct ALIGN(16) SketchStrides {
  size_t bin_stride;
  size_t kmer_stride;
  size_t sample_stride;
  size_t sketchsize64;
  size_t bbits;
};

struct ALIGN(8) SketchSlice {
  size_t ref_offset;
  size_t ref_size;
  size_t query_offset;
  size_t query_size;
};

// Memory on device for each operation
class DeviceMemory {
public:
  DeviceMemory(SketchStrides &ref_strides, SketchStrides &query_strides,
               std::vector<Reference> &ref_sketches,
               std::vector<Reference> &query_sketches,
               const SketchSlice &sample_slice, const FlatRandom &flat_random,
               const std::vector<uint16_t> &ref_random_idx,
               const std::vector<uint16_t> &query_random_idx,
               const std::vector<size_t> &kmer_lengths, long long dist_rows,
               const bool self, const int cpu_threads);

  ~DeviceMemory();

  std::vector<float> read_dists();

  uint64_t *ref_sketches() { return d_ref_sketches; }
  uint64_t *query_sketches() { return d_query_sketches; }
  float *random_table() { return d_random_table; }
  uint16_t *ref_random() { return d_ref_random; }
  uint16_t *query_random() { return d_query_random; }
  int *kmers() { return d_kmers; }
  float *dist_mat() { return d_dist_mat; }

private:
  DeviceMemory(const DeviceMemory &) = delete;
  DeviceMemory(DeviceMemory &&) = delete;

  size_t _n_dists;
  uint64_t *d_ref_sketches;
  uint64_t *d_query_sketches;
  float *d_random_table;
  uint16_t *d_ref_random;
  uint16_t *d_query_random;
  int *d_kmers;
  float *d_dist_mat;
};

#endif
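
Note: a sketch of the intended call pattern for DeviceMemory; every variable
name here is assumed for illustration rather than taken from this commit:

// Allocate all device arrays for one slice of the distance matrix,
// launch the (not shown) distance kernel on the raw pointers, then
// copy the results back to the host.
DeviceMemory device_arrays(ref_strides, query_strides, ref_sketches,
                           query_sketches, slice, flat_random, ref_random_idx,
                           query_random_idx, kmer_lengths, dist_rows,
                           /*self=*/true, /*cpu_threads=*/4);
// ... kernel launch using device_arrays.ref_sketches(),
//     device_arrays.random_table(), device_arrays.dist_mat() ...
std::vector<float> dists = device_arrays.read_dists(); // 2 * dist_rows values
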
58 changes: 58 additions & 0 deletions src/gpu/device_reads.cu
@@ -0,0 +1,58 @@

#include "cuda.cuh"
#include "device_reads.cuh"

DeviceReads::DeviceReads(const std::shared_ptr<SeqBuf> &seq_ptr,
const size_t n_threads)
: seq(seq_ptr), n_reads(seq->n_full_seqs()), read_length(seq->max_length()),
current_block(0), buffer_filled(0), loaded_first(false) {
// Set up buffer to load in reads (on host)
size_t mem_free = 0;
size_t mem_total = 0;
CUDA_CALL(cudaMemGetInfo(&mem_free, &mem_total));
buffer_size = (mem_free * 0.9) / (read_length * sizeof(char));
buffer_blocks =
std::floor(n_reads / (static_cast<double>(buffer_size) + 1)) + 1;
if (buffer_size > n_reads) {
buffer_size = n_reads;
buffer_blocks = 1;
}
host_buffer.resize(buffer_size * read_length);
CUDA_CALL_NOTHROW(cudaHostRegister(host_buffer.data(),
host_buffer.size() * sizeof(char),
cudaHostRegisterDefault));

// Buffer to store reads (on device)
CUDA_CALL(
cudaMalloc((void **)&d_reads, buffer_size * read_length * sizeof(char)));
}

DeviceReads::~DeviceReads() {
CUDA_CALL_NOTHROW(cudaHostUnregister(host_buffer.data()));
CUDA_CALL_NOTHROW(cudaFree(d_reads));
}

bool DeviceReads::next_buffer() {
bool success;
if (current_block < buffer_blocks) {
if (buffer_blocks > 1 || !loaded_first) {
size_t start = current_block * buffer_size;
size_t end = (current_block + 1) * buffer_size;
if (end > seq->n_full_seqs()) {
end = seq->n_full_seqs();
}
buffer_filled = end - start;

seq->load_seqs(host_buffer, start, end);
CUDA_CALL(cudaMemcpyAsync(d_reads, host_buffer.data(),
buffer_filled * read_length * sizeof(char),
cudaMemcpyDefault));
loaded_first = true;
}
current_block++;
success = true;
} else {
success = false;
}
return success;
}
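
Note: a quick numeric check of the block arithmetic in the constructor, with
illustrative values only; the +1 inside the floor and the trailing +1 together
ensure the final partial block is counted:

#include <cassert>
#include <cmath>

int main() {
  const size_t n_reads = 10, buffer_size = 4; // illustrative values
  size_t buffer_blocks =
      std::floor(n_reads / (static_cast<double>(buffer_size) + 1)) + 1;
  assert(buffer_blocks == 3); // blocks cover [0,4), [4,8), [8,10)
  return 0;
}
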
36 changes: 36 additions & 0 deletions src/gpu/device_reads.cuh
@@ -0,0 +1,36 @@
#pragma once

#include <memory> // shared_ptr
#include <vector>

#include "sketch/seqio.hpp" // SeqBuf

class DeviceReads {
public:
  DeviceReads(const std::shared_ptr<SeqBuf> &seq_in, const size_t n_threads);
  ~DeviceReads();

  bool next_buffer();

  void reset_buffer() { current_block = 0; }
  char *read_ptr() { return d_reads; }
  size_t buffer_count() const { return buffer_filled; }
  size_t length() const { return read_length; }

private:
  // delete copy and move to avoid accidentally using them
  DeviceReads(const DeviceReads &) = delete;
  DeviceReads(DeviceReads &&other) = delete;

  char *d_reads;
  std::vector<char> host_buffer;
  std::shared_ptr<SeqBuf> seq;

  size_t n_reads;
  size_t read_length;
  size_t buffer_size;
  size_t buffer_blocks;
  size_t current_block;
  size_t buffer_filled;
  bool loaded_first;
};
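
Note: a sketch of the assumed driver loop (not shown in this commit) that
streams every block of reads through the single device buffer; the function
name and thread count are placeholders:

void sketch_reads(const std::shared_ptr<SeqBuf> &seq_buf) {
  DeviceReads reads(seq_buf, 4); // 4 CPU threads (illustrative)
  while (reads.next_buffer()) {
    // launch a counting kernel over reads.read_ptr();
    // this block holds reads.buffer_count() reads of reads.length() bases
  }
  reads.reset_buffer(); // rewind so the blocks can be streamed again
}
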