Merge pull request #72 from bacpop/v2.0.0_candidate
Update to v2: new CLI
johnlees authored Apr 29, 2022
2 parents e5b11eb + 1e8cb9b commit 9c416f7
Showing 41 changed files with 1,585 additions and 1,152 deletions.
103 changes: 59 additions & 44 deletions CMakeLists.txt
@@ -20,27 +20,33 @@ set(TARGET_NAME pp_sketchlib)
add_compile_definitions(PYTHON_EXT)

# gcc: Add openmp
# gcc: Add -O0 to remove optimizations when using debug
IF(CMAKE_COMPILER_IS_GNUCC)
IF(CMAKE_COMPILER_IS_GNUCC OR "$ENV{SKETCHLIB_INSTALL}" STREQUAL "conda")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0")
ENDIF(CMAKE_COMPILER_IS_GNUCC)
ENDIF()

# Add -O0 to remove optimizations when using debug
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0")

if(UNIX AND NOT APPLE)
if(CMAKE_CXX_COMPILER STREQUAL "icpc")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fast -xCASCADELAKE -DMKL_ILP64 -m64 -static-intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fast -march=\"native\" -DMKL_ILP64 -m64 -static-intel")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS")
set(CMAKE_LD_FLAGS "${CMAKE_LDFLAGS} -Wl,--as-needed")
endif()
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffast-math -funroll-loops -m64")

# Set paths for non standard lib/ and include/ locations
if(DEFINED ENV{CONDA_PREFIX})
include_directories($ENV{CONDA_PREFIX}/include)
link_directories($ENV{CONDA_PREFIX}/lib)
link_directories($ENV{CONDA_PREFIX}/lib/intel64)
else()
find_package(OpenMP)
endif()

# Add libraries
@@ -50,12 +56,12 @@ include_directories(${HDF5_INCLUDE_DIRS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/vendor/highfive/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

find_package(ZLIB)
execute_process(COMMAND pybind11-config --cmakedir OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE pybind11_DIR)
find_package(pybind11 2.6 CONFIG REQUIRED)
find_package(Eigen3 3.3 REQUIRED NO_MODULE)
find_package(Armadillo REQUIRED)
include_directories(${ARMADILLO_INCLUDE_DIRS})
#find_package(OpenMP) # This links system openmp if present - conda sorts out rpath but take care

# Define python library target
add_library("${TARGET_NAME}" MODULE)
@@ -64,42 +70,46 @@ add_library("${TARGET_NAME}" MODULE)
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
message(STATUS "CUDA found, compiling both GPU and CPU code")
enable_language(CUDA)

# PIC/relocatable-device-code needed as this is linked by gcc later
# -Xptxas -dlcm=ca turns cache on, but not needed in recent nvcc versions
# --cudart static: static linking of the CUDA libraries
# -gencode arch=compute_35 etc compiles for each (minimum) device version listed (v3.5, v5.0, v7.5)
set(CUDA_OPTS "-Xcompiler -fPIC -Xptxas -dlcm=ca --relocatable-device-code=true --expt-relaxed-constexpr")
# Turn on link time optimisation if available
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 11.0 AND CMAKE_BUILD_TYPE MATCHES Release)
string(APPEND CUDA_OPTS " -dlto -arch=sm_86")
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 11.0)
message(STATUS "CUDA >11.0 found, compiling both GPU and CPU code")

# PIC/relocatable-device-code needed as this is linked by gcc later
# -Xptxas -dlcm=ca turns cache on, but not needed in recent nvcc versions
# --cudart static: static linking of the CUDA libraries
# -gencode arch=compute_35 etc compiles for each (minimum) device version listed (v3.5, v5.0, v7.5)
set(CUDA_OPTS "-Xcompiler -fPIC -Xptxas -dlcm=ca --relocatable-device-code=true --expt-relaxed-constexpr")
# Turn on link time optimisation if available
if(CMAKE_BUILD_TYPE MATCHES Release)
string(APPEND CUDA_OPTS " -dlto -arch=sm_86")
else()
string(APPEND CUDA_OPTS " -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86")
endif()
if(CMAKE_BUILD_TYPE MATCHES Debug)
string(APPEND CUDA_OPTS " -G")
endif()

set(CMAKE_CUDA_FLAGS "${CUDA_OPTS}")

add_compile_definitions(GPU_AVAILABLE)
add_library("${TARGET_NAME}_CUDA" OBJECT src/gpu/dist.cu
src/gpu/sketch.cu
src/gpu/device_memory.cu
src/gpu/gpu_countmin.cu
src/gpu/device_reads.cu)
target_include_directories("${TARGET_NAME}_CUDA" PRIVATE "${EIGEN3_INCLUDE_DIR}" "${pybind11_INCLUDE_DIRS}")
set_property(TARGET "${TARGET_NAME}_CUDA"
PROPERTY POSITION_INDEPENDENT_CODE ON
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON # try and ensure device link with nvcc
CUDA_VISIBILITY_PRESET "hidden"
CUDA_RUNTIME_LIBRARY Static)
#CUDA_ARCHITECTURES OFF) # set off as done explicitly above (due to dlto complexities)
# CPU code/gcc compiled code needed by cuda lib
target_sources("${TARGET_NAME}" PRIVATE src/gpu/gpu_api.cpp)
else()
string(APPEND CUDA_OPTS " -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86")
endif()
if(CMAKE_BUILD_TYPE MATCHES Debug)
string(APPEND CUDA_OPTS " -G")
message(STATUS "CUDA >=11.0 required, compiling CPU code only")
endif()

set(CMAKE_CUDA_FLAGS "${CUDA_OPTS}")

add_compile_definitions(GPU_AVAILABLE)
add_library("${TARGET_NAME}_CUDA" OBJECT src/gpu/dist.cu
src/gpu/sketch.cu
src/gpu/device_memory.cu
src/gpu/gpu_countmin.cu
src/gpu/device_reads.cu)
target_include_directories("${TARGET_NAME}_CUDA" PRIVATE "${EIGEN3_INCLUDE_DIR}" "${pybind11_INCLUDE_DIRS}")
set_property(TARGET "${TARGET_NAME}_CUDA"
PROPERTY POSITION_INDEPENDENT_CODE ON
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON # try and ensure device link with nvcc
CUDA_VISIBILITY_PRESET "hidden"
CUDA_RUNTIME_LIBRARY Static)
#CUDA_ARCHITECTURES OFF) # set off as done explicitly above (due to dlto complexities)
# CPU code/gcc compiled code needed by cuda lib
target_sources("${TARGET_NAME}" PRIVATE src/gpu/gpu_api.cpp)
else()
message(STATUS "CUDA not found, compiling CPU code only")
endif()
@@ -133,7 +143,12 @@ if(CMAKE_CUDA_COMPILER)
set_property(TARGET "${TARGET_NAME}" PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET "${TARGET_NAME}" PROPERTY CUDA_ARCHITECTURES OFF)
endif()
target_link_libraries("${TARGET_NAME}" PRIVATE pybind11::module Eigen3::Eigen z ${HDF5_LIBRARIES} gomp openblas lapack gfortran m dl)
#if(OpenMP_CXX_FOUND)
# target_link_libraries("${TARGET_NAME}" PRIVATE OpenMP::OpenMP_CXX)
#endif()
target_link_libraries("${TARGET_NAME}" PRIVATE pybind11::module Eigen3::Eigen ${HDF5_LIBRARIES} openblas lapack gfortran m dl)
if(DEFINED ENV{CONDA_PREFIX} AND (NOT APPLE OR CMAKE_COMPILER_IS_GNUCC OR "$ENV{SKETCHLIB_INSTALL}" STREQUAL "conda"))
target_link_libraries("${TARGET_NAME}" PRIVATE gomp z)
else()
target_link_libraries("${TARGET_NAME}" PRIVATE ZLIB::ZLIB)
if(OpenMP_CXX_FOUND)
target_link_libraries("${TARGET_NAME}" PRIVATE OpenMP::OpenMP_CXX)
endif()
endif()
119 changes: 25 additions & 94 deletions README.md
@@ -1,12 +1,13 @@
# pp-sketchlib <img src='sketchlib_logo.png' align="right" height="139" />

<!-- badges: start -->
[![Build status](https://dev.azure.com/jlees/pp-sketchlib/_apis/build/status/johnlees.pp-sketchlib?branchName=master)](https://dev.azure.com/jlees/pp-sketchlib/_build/latest?definitionId=1&branchName=master)
[![Build Status](https://dev.azure.com/jlees/pp-sketchlib/_apis/build/status/bacpop.pp-sketchlib?branchName=master)](https://dev.azure.com/jlees/pp-sketchlib/_build/latest?definitionId=4&branchName=master)
[![Build status](https://badge.buildkite.com/b1bc9ccd16211ca5a55846b95e297554e5aa3b544d8cb752b0.svg?branch=master;theme=github)](https://buildkite.com/mrc-ide/pp-sketchlib)
[![Anaconda package](https://anaconda.org/conda-forge/pp-sketchlib/badges/version.svg)](https://anaconda.org/conda-forge/pp-sketchlib)
<!-- badges: end -->


Library of sketching functions used by [PopPUNK](https://www.poppunk.net).
Library of sketching functions used by [PopPUNK](https://www.poppunk.net). See documentation at http://poppunk.readthedocs.io/en/latest/sketching.html

## Installation

@@ -67,7 +68,7 @@ installed (tested on 10.2 and 11.0).
Create a set of sketches and save these as a database:

```
poppunk_sketch --sketch --rfile rfiles.txt --ref-db listeria --sketch-size 10000 --cpus 4 --min-k 15 --k-step 2
sketchlib sketch -l rfiles.txt -o listeria --cpus 4
```

The input file `rfiles.txt` has one sequence per line. The first column is the sample name, subsequent tab-separated
@@ -79,52 +80,47 @@ sample2 sample2.fa
sample3 sample3_1.fq.gz sample3_2.fq.gz
```
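
A list file like this can be generated with a few lines of scripting. As a minimal Python sketch (the `assemblies/` directory and the use of each file's stem as the sample name are illustrative assumptions):

```
from pathlib import Path

# Write a tab-separated rfiles.txt: sample name, then sequence file(s).
# Assumes one assembly per sample, stored as assemblies/<sample>.fa
with open("rfiles.txt", "w") as out:
    for fasta in sorted(Path("assemblies").glob("*.fa")):
        out.write(f"{fasta.stem}\t{fasta}\n")
```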

Calculate core and accessory distances between databases with `--query`. If all-vs-all, only the upper triangle is calculated,
Calculate core and accessory distances between databases with `query dist`. If all-vs-all, only the upper triangle is calculated,
for example:

```
poppunk_sketch --query --ref_db listeria --query_db listeria --cpus 4
sketchlib query dist listeria --cpus 4
```

This will save output files as a database for use with PopPUNK. If you wish to output the
distances add the `--print` option:

```
poppunk_sketch --query --ref_db listeria --query_db listeria --cpus 4 --print > distances.txt
```
This will print the distances to STDOUT, which can be captured with `>`. If you wish to save the output as a database for use with PopPUNK, add the `-o` option.
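
For example, a small Python wrapper can run the query and capture the printed distances to a file (a sketch only: the database name `listeria` and the output path are placeholders):

```
import subprocess

# Run the all-vs-all distance calculation and save the distances printed to STDOUT
result = subprocess.run(
    ["sketchlib", "query", "dist", "listeria", "--cpus", "4"],
    capture_output=True, text=True, check=True,
)
with open("distances.txt", "w") as out:
    out.write(result.stdout)
```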

### Other options

Sketching:

- `--strand` ignores reverse complement k-mers, if input is all in the same sense
- `--single-strand` ignores reverse complement k-mers, if input is all in the same sense
- `--min-count` minimum k-mer count to include when using reads
- `--exact-counter` uses a hash table to count k-mers, which is recommended for non-bacterial datasets.

Query:

- To only use some of the samples in the sketch database, you can add the `--subset` option with a file which lists the required sample names.
- `--jaccard` will output the Jaccard distances, rather than core and accessory distances.
- `query jaccard` will output the Jaccard distances, rather than core and accessory distances.
- `query sparse` will output a sparse distance matrix,
using either a distance `--threshold` or the k-nearest neighbours (`-kNN`).

### Large datasets

When working with large datasets, you can increase the `--cpus` to high numbers and get
a roughly proportional performance increase.

If you are calculating sketches of read datasets, or large numbers of distances, and have a CUDA-compatible GPU,
you can calculate distances on your graphics device even more quickly. Add the `--use-gpu` option:
you can calculate distances on your graphics device even more quickly. Add the `--gpu` option with the desired
device ID:

```
poppunk_sketch --sketch --rfile rfiles.txt --ref-db listeria --cpus 4 --use-gpu
poppunk_sketch --query --ref-db listeria --query-db listeria --use-gpu
sketchlib sketch -l rfiles.txt -o listeria --cpus 4 --gpu 0
sketchlib query dist listeria --gpu 0
```

Both CPU parallelism and the GPU will be used, so be sure to add
both `--cpus` and `--use-gpu` for maximum speed. This is particularly efficient
when sketching.

You can set the `--gpu-id` if you have more than one device, which may be necessary on
cluster systems. This mode can also benefit from having multiple CPU cores available too.
both `--cpus` and `--gpu` for maximum speed. This is particularly efficient
when sketching reads.

### Benchmarks

@@ -193,12 +189,16 @@ contain `sketch` and may contain `random`. Run `h5dump` to see the full contents
Contents are programmatically accessible with any HDF5 API. See `__main__.py` for an
example in python.

See `poppunk_db_info` from the [PopPUNK](https://github.com/johnlees/PopPUNK) package for pretty printing.

#### sketch

Attributes:

- `sketch_version` - version of sketching code used to create the database.
The SHA1 hash of relevant code files (doesn't change with every commit).
- `codon_phased` - 1 if codon-phased seeds were used.
- `reverse_complement` - 0 if `--single-strand`.

Contains a group for each sample, within each has attributes:

@@ -230,78 +230,6 @@ Datasets:
- `table_keys` - sample order of `table_values`.
- `table_values` - centroid ID assigned to each sample.

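As a rough illustration of this layout, the file can be opened with `h5py` (one of many HDF5 APIs; a hedged sketch using the group, attribute and dataset names listed above, with `listeria.h5` as a placeholder file name):

```
import h5py

with h5py.File("listeria.h5", "r") as db:
    sketches = db["sketch"]
    # Database-level attributes described above
    print("sketch_version:", sketches.attrs["sketch_version"])
    print("codon_phased:", sketches.attrs["codon_phased"])
    # One group per sample
    print("samples:", list(sketches.keys()))
    # Optional random-match tables
    if "random" in db:
        random_grp = db["random"]
        centroids = dict(zip(random_grp["table_keys"][()],
                             random_grp["table_values"][()]))
        print("centroid assignments:", centroids)
```
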
C++
---
I have yet to set up a proper namespace for this, but you can include this
code (`api.hpp` will do most functions) and use the parts you need. If you
are interested in this becoming more functional, please raise an issue.

See `main.cpp` for examples:

```
#include <fstream>
#include <iostream>
#include "reference.hpp"
#include "database.hpp"
#include "random_match.hpp"
#include "api.hpp"
// Set k-mer lengths
std::vector<size_t> kmer_lengths {15, 17, 19, 21, 23, 25, 27, 29};
// Create two sketches
Reference ref(argv[1], {argv[2]}, kmer_lengths, 156, true, 0, false);
Reference query(argv[3], {argv[4]}, kmer_lengths, 156, true, 0, false);
// Use default random match chances
RandomMC random(true);
// Output some distances at a single k-mer length
std::cout << ref.jaccard_dist(query, 15, random) << std::endl;
std::cout << ref.jaccard_dist(query, 29, random) << std::endl;
// Calculate core and accessory distances between two sketches
auto core_acc = ref.core_acc_dist<RandomMC>(query, random);
std::cout << std::get<0>(core_acc) << "\t" << std::get<1>(core_acc) << std::endl;
// Save sketches to file
Database sketch_db("sketch.h5");
sketch_db.add_sketch(ref);
sketch_db.add_sketch(query);
// Read sketches from file
Reference ref_read = sketch_db.load_sketch(argv[1]);
Reference query_read = sketch_db.load_sketch(argv[3]);
// Create sketches using multiple threads, saving to file
std::vector<Reference> ref_sketches = create_sketches("full",
{argv[1], argv[3]},
{{argv[2]}, {argv[4]}},
kmer_lengths,
156,
true,
0,
false,
2);
// Calculate distances between sketches using multiple threads
MatrixXf dists = query_db(ref_sketches,
ref_sketches,
kmer_lengths,
random,
false,
2);
std::cout << dists << std::endl;
// Read sketches from an existing database, using random access
HighFive::File h5_db("listeria.h5");
Database listeria_db(h5_db);
std::vector<Reference> listeria_sketches;
// "names" is a std::vector<std::string> of sample names to load (definition not shown here)
for (auto name_it = names.cbegin(); name_it != names.cend(); name_it++)
{
listeria_sketches.push_back(listeria_db.load_sketch(*name_it));
}
```

## Algorithms

### Sketching
@@ -363,7 +291,7 @@ Blais & Blanchette is used (formula 6 in the paper cited below).
sketch each separately and join the databases.
- GPU sketching filters out any read containing an N, which may give slightly
different results from the CPU code.
- GPU sketching with variable read lengths is untested, but theoretically supported.
- GPU sketching with variable read lengths is unsupported. Illumina data only for now!
- GPU distances use lower precision than the CPU code, so slightly different results
are expected.

@@ -427,6 +355,9 @@ Modifiers:
- `PROFILE=1` runs with profiler flags for `ncu` and `nsys`
- `GPU=1` also build CUDA code (assumes `/usr/local/cuda-11.1/` and SM v8.6)

### Azure
The repository key for the Ubuntu CUDA install is periodically updated, which may cause build failures. See https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ and update the key in `azure-pipelines.yml`.

### Test that Python can build an installable package

Build a Python source package and install it into an empty Docker container with vanilla Python 3. If this works, there's a good chance that the version uploaded to PyPI will work.
4 changes: 2 additions & 2 deletions azure-pipelines.yml
@@ -29,7 +29,7 @@ steps:
- script: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
sudo apt-get update
sudo apt-get -y install cuda=11.2.2-1
@@ -54,5 +54,5 @@ steps:
export CUDA_HOME=/usr/local/cuda-11.2
export PATH=${CUDA_HOME}/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
cd test && python run_test.py --no-cpp
cd test && python run_test.py
displayName: 'Run tests (run_test.py)'
2 changes: 1 addition & 1 deletion docker/test
@@ -8,4 +8,4 @@ HERE=$(dirname $0)
[ ! -z $(docker images -q $TAG_SHA) ] || docker pull $TAG_SHA

## Just check that we can bring up the container and run something
docker run -it -w /src --rm $TAG_SHA poppunk_sketch --version
docker run -it -w /src --rm $TAG_SHA sketchlib --version
3 changes: 2 additions & 1 deletion environment.yml
@@ -9,6 +9,7 @@ dependencies:
- pip
- numpy
- scipy
- docopt
- cmake >= 3.12
- pybind11
- zlib
@@ -21,4 +22,4 @@ dependencies:
- armadillo
- libgfortran-ng
- nvcc_linux-64
- cudatoolkit==11.2
- cudatoolkit==11.2 # This is pinned due to version install on azure, see azure-pipelines.yml
2 changes: 1 addition & 1 deletion pp_sketch/__init__.py
@@ -3,4 +3,4 @@

'''PopPUNK sketching functions'''

__version__ = '1.7.6.2'
__version__ = '2.0.0'
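
The bumped version is exposed to Python; a quick check (assuming the package is installed and imports as `pp_sketch`, as the file path above suggests):

```
import pp_sketch

# Should print 2.0.0 for this release
print(pp_sketch.__version__)
```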