diff --git a/.gitignore b/.gitignore index 770c680d..2f7017ff 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ python/libgdf_cffi/libgdf_cffi.py ## eclipse .project + +build2/ diff --git a/CMakeLists.txt b/CMakeLists.txt index d2cb7a84..39949076 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ #============================================================================= # Copyright 2018 BlazingDB, Inc. # Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright 2018 Cristhian Alberto Gonzales Castillo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,7 +26,7 @@ PROJECT(libgdf) -cmake_minimum_required(VERSION 2.8) # not sure about version required +cmake_minimum_required(VERSION 3.3) # not sure about version required set(CMAKE_CXX_STANDARD 11) message(STATUS "Using C++ standard: c++${CMAKE_CXX_STANDARD}") @@ -46,6 +47,7 @@ include(CTest) # Include custom modules (see cmake directory) include(ConfigureGoogleTest) include(ConfigureArrow) +include(ConfigureParquetCpp) find_package(CUDA) set_package_properties( @@ -83,12 +85,15 @@ else() message(FATAL_ERROR "Apache Arrow not found, please check your settings.") endif() +get_property(PARQUETCPP_INCLUDE_DIRS TARGET Apache::ParquetCpp PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + include_directories( "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cub" "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/moderngpu/src" "${CUDA_INCLUDE_DIRS}" "${ARROW_INCLUDEDIR}" + "${PARQUETCPP_INCLUDE_DIRS}" ) IF(CUDA_VERSION_MAJOR GREATER 7) @@ -119,6 +124,19 @@ if(HT_LEGACY_ALLOCATOR) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHT_LEGACY_ALLOCATOR) endif() +cuda_add_library(gdf-parquet + src/parquet/api.cpp + src/parquet/column_reader.cu + src/parquet/file_reader.cpp + src/parquet/file_reader_contents.cpp + src/parquet/page_reader.cpp + src/parquet/row_group_reader_contents.cpp + src/parquet/decoder/cu_level_decoder.cu + src/arrow/cu_decoder.cu + src/arrow/util/pinned_allocator.cu +) + +target_link_libraries(gdf-parquet Apache::ParquetCpp) cuda_add_library(gdf SHARED src/binaryops.cu @@ -198,5 +216,10 @@ if(GTEST_FOUND) else() message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.") endif() + +if(GDF_BENCHMARK) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/bench) +endif() + # Print the project summary feature_summary(WHAT ALL INCLUDE_QUIET_PACKAGES FATAL_ON_MISSING_REQUIRED_PACKAGES) diff --git a/cmake/Modules/ConfigureArrow.cmake b/cmake/Modules/ConfigureArrow.cmake index 030e9986..e644d504 100644 --- a/cmake/Modules/ConfigureArrow.cmake +++ b/cmake/Modules/ConfigureArrow.cmake @@ -1,6 +1,7 @@ #============================================================================= # Copyright 2018 BlazingDB, Inc. # Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright 2018 Cristhian Alberto Gonzales Castillo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +16,7 @@ # limitations under the License. 
 #=============================================================================
 
-set(ARROW_DOWNLOAD_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/arrow-download/)
+set(ARROW_DOWNLOAD_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/arrow-download)
 
 # Download and unpack arrow at configure time
 configure_file(${CMAKE_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake ${ARROW_DOWNLOAD_BINARY_DIR}/CMakeLists.txt COPYONLY)
diff --git a/cmake/Modules/ConfigureParquetCpp.cmake b/cmake/Modules/ConfigureParquetCpp.cmake
new file mode 100644
index 00000000..c425bd55
--- /dev/null
+++ b/cmake/Modules/ConfigureParquetCpp.cmake
@@ -0,0 +1,89 @@
+#=============================================================================
+# Copyright 2018 BlazingDB, Inc.
+# Copyright 2018 Cristhian Alberto Gonzales Castillo
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Download and unpack ParquetCpp at configure time
+configure_file(${CMAKE_SOURCE_DIR}/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download/CMakeLists.txt)
+
+execute_process(
+    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download/
+)
+
+if(result)
+    message(FATAL_ERROR "CMake step for ParquetCpp failed: ${result}")
+endif()
+
+# Transitive dependencies
+set(ARROW_TRANSITIVE_DEPENDENCIES_PREFIX ${ARROW_DOWNLOAD_BINARY_DIR}/arrow-prefix/src/arrow-build)
+set(BROTLI_TRANSITIVE_DEPENDENCY_PREFIX ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu)
+set(BROTLI_STATIC_LIB_ENC ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlienc.a)
+set(BROTLI_STATIC_LIB_DEC ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlidec.a)
+set(BROTLI_STATIC_LIB_COMMON ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlicommon.a)
+set(SNAPPY_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/snappy_ep/src/snappy_ep-install/lib/libsnappy.a)
+set(ZLIB_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/zlib_ep/src/zlib_ep-install/lib/libz.a)
+set(LZ4_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/lz4_ep-prefix/src/lz4_ep/lib/liblz4.a)
+set(ZSTD_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/zstd_ep-prefix/src/zstd_ep/lib/libzstd.a)
+set(ARROW_HOME ${ARROW_ROOT})
+
+set(ENV{BROTLI_STATIC_LIB_ENC} ${BROTLI_STATIC_LIB_ENC})
+set(ENV{BROTLI_STATIC_LIB_DEC} ${BROTLI_STATIC_LIB_DEC})
+set(ENV{BROTLI_STATIC_LIB_COMMON} ${BROTLI_STATIC_LIB_COMMON})
+set(ENV{SNAPPY_STATIC_LIB} ${SNAPPY_STATIC_LIB})
+set(ENV{ZLIB_STATIC_LIB} ${ZLIB_STATIC_LIB})
+set(ENV{LZ4_STATIC_LIB} ${LZ4_STATIC_LIB})
+set(ENV{ZSTD_STATIC_LIB} ${ZSTD_STATIC_LIB})
+set(ENV{ARROW_HOME} ${ARROW_HOME})
+
+execute_process(
+    COMMAND ${CMAKE_COMMAND} --build .
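+    # Note: building the generated super-build at configure time leaves libparquet.a and its
+    # headers installed under thirdparty/parquetcpp-install before the imported targets below
+    # are defined.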
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download) + +if(result) + message(FATAL_ERROR "Build step for ParquetCpp failed: ${result}") +endif() + +# Add transitive dependency: Thrift +set(THRIFT_ROOT ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build/thrift_ep/src/thrift_ep-install) + +# Locate ParquetCpp package +set(PARQUETCPP_ROOT ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install) +set(PARQUETCPP_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build) +set(PARQUETCPP_SOURCE_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-src) + +# Dependency interfaces +find_package(Boost REQUIRED COMPONENTS regex) + +add_library(Apache::Thrift INTERFACE IMPORTED) +set_target_properties(Apache::Thrift + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${THRIFT_ROOT}/include) +set_target_properties(Apache::Thrift + PROPERTIES INTERFACE_LINK_LIBRARIES ${THRIFT_ROOT}/lib/libthrift.a) + +add_library(Apache::Arrow INTERFACE IMPORTED) +set_target_properties(Apache::Arrow + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ARROW_ROOT}/include) +set_target_properties(Apache::Arrow + PROPERTIES INTERFACE_LINK_LIBRARIES "${ARROW_ROOT}/lib/libarrow.a;${BROTLI_STATIC_LIB_ENC};${BROTLI_STATIC_LIB_DEC};${BROTLI_STATIC_LIB_COMMON};${SNAPPY_STATIC_LIB};${ZLIB_STATIC_LIB};${LZ4_STATIC_LIB};${ZSTD_STATIC_LIB}") + +add_library(Apache::ParquetCpp INTERFACE IMPORTED) +set_target_properties(Apache::ParquetCpp + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${PARQUETCPP_ROOT}/include;${PARQUETCPP_BINARY_DIR}/src;${PARQUETCPP_SOURCE_DIR}/src") +set_target_properties(Apache::ParquetCpp + PROPERTIES INTERFACE_LINK_LIBRARIES "${PARQUETCPP_ROOT}/lib/libparquet.a;Apache::Arrow;Apache::Thrift;Boost::regex") diff --git a/cmake/Templates/Arrow.CMakeLists.txt.cmake b/cmake/Templates/Arrow.CMakeLists.txt.cmake index 3fcbb108..7d4b7bc5 100644 --- a/cmake/Templates/Arrow.CMakeLists.txt.cmake +++ b/cmake/Templates/Arrow.CMakeLists.txt.cmake @@ -1,6 +1,7 @@ #============================================================================= # Copyright 2018 BlazingDB, Inc. # Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright 2018 Cristhian Alberto Gonzales Castillo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,7 +24,7 @@ project(arrow-download NONE) include(ExternalProject) -set(ARROW_VERSION "apache-arrow-0.10.0") +set(ARROW_VERSION "apache-arrow-0.9.0") if (NOT "$ENV{PARQUET_ARROW_VERSION}" STREQUAL "") set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}") @@ -34,13 +35,6 @@ message(STATUS "Using Apache Arrow version: ${ARROW_VERSION}") set(ARROW_URL "https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz") set(ARROW_CMAKE_ARGS - #Arrow dependencies - -DARROW_WITH_LZ4=OFF - -DARROW_WITH_ZSTD=OFF - -DARROW_WITH_BROTLI=OFF - -DARROW_WITH_SNAPPY=OFF - -DARROW_WITH_ZLIB=OFF - #Build settings -DARROW_BUILD_STATIC=ON -DARROW_BUILD_SHARED=OFF @@ -48,10 +42,12 @@ set(ARROW_CMAKE_ARGS -DARROW_BUILD_TESTS=OFF -DARROW_TEST_MEMCHECK=OFF -DARROW_BUILD_BENCHMARKS=OFF + -DARROW_BUILD_UTILITIES=OFF + -DARROW_JEMALLOC=OFF #Arrow modules -DARROW_IPC=ON - -DARROW_COMPUTE=OFF + -DARROW_COMPUTE=ON -DARROW_GPU=OFF -DARROW_JEMALLOC=OFF -DARROW_BOOST_VENDORED=OFF diff --git a/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake b/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake new file mode 100644 index 00000000..2f61a0c2 --- /dev/null +++ b/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake @@ -0,0 +1,44 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+cmake_minimum_required(VERSION 2.8.12)
+
+project(parquetcpp-download NONE)
+
+include(ExternalProject)
+
+set(PARQUET_VERSION apache-parquet-cpp-1.4.0)
+
+if (NOT "$ENV{PARQUET_VERSION}" STREQUAL "")
+    set(PARQUET_VERSION $ENV{PARQUET_VERSION})
+endif()
+
+message(STATUS "Using Apache ParquetCpp version: ${PARQUET_VERSION}")
+
+ExternalProject_Add(parquetcpp
+    BINARY_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build"
+    CMAKE_ARGS
+        -DCMAKE_BUILD_TYPE=RELEASE
+        -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install
+        -DPARQUET_ARROW_LINKAGE=static
+        -DPARQUET_BUILD_SHARED=OFF
+        -DPARQUET_BUILD_TESTS=OFF
+    GIT_REPOSITORY https://github.com/apache/parquet-cpp.git
+    GIT_TAG ${PARQUET_VERSION}
+    INSTALL_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install"
+    SOURCE_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-src"
+)
diff --git a/conda_environments/dev_py35.yml b/conda_environments/dev_py35.yml
index c879875e..b4fbb068 100644
--- a/conda_environments/dev_py35.yml
+++ b/conda_environments/dev_py35.yml
@@ -24,4 +24,6 @@ dependencies:
 - llvmlite=0.18.0=py35_0
 - numba=0.34.0.dev=np112py35_316
 - cmake=3.6.3=0
+- flex=2.6.0
+- bison=3.0.4
 - pyarrow=0.10.0
diff --git a/include/gdf/cffi/types.h b/include/gdf/cffi/types.h
index d8590aca..9bdf04b0 100644
--- a/include/gdf/cffi/types.h
+++ b/include/gdf/cffi/types.h
@@ -48,6 +48,8 @@ typedef enum {
     GDF_INVALID_API_CALL, /**< The arguments passed into the function were invalid */
     GDF_JOIN_DTYPE_MISMATCH, /**< Datatype mismatch between corresponding columns in left/right tables in the Join function */
     GDF_JOIN_TOO_MANY_COLUMNS, /**< Too many columns were passed in for the requested join operation*/
+
+    GDF_IO_ERROR, /**< Error occurred in the parquet-reader API that loads a parquet file into gdf_columns */
     GDF_DTYPE_MISMATCH, /**< Type mismatch between columns that should be the same type */
     GDF_UNSUPPORTED_METHOD, /**< The method requested to perform an operation was invalid or unsupported (e.g., hash vs. sort)*/
     GDF_INVALID_AGGREGATOR, /**< Invalid aggregator was specified for a groupby*/
diff --git a/include/gdf/parquet/api.h b/include/gdf/parquet/api.h
new file mode 100644
index 00000000..e77dbc1e
--- /dev/null
+++ b/include/gdf/parquet/api.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2018 BlazingDB, Inc.
+ * Copyright 2018 Cristhian Alberto Gonzales Castillo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gdf/gdf.h>
+
+#ifdef __cplusplus
+#define BEGIN_NAMESPACE_GDF_PARQUET \
+    namespace gdf {                 \
+    namespace parquet {
+#define END_NAMESPACE_GDF_PARQUET \
+    }                             \
+    }
+#else
+#define BEGIN_NAMESPACE_GDF_PARQUET
+#define END_NAMESPACE_GDF_PARQUET
+#endif
+
+BEGIN_NAMESPACE_GDF_PARQUET
+
+/// \brief Read a parquet file from a file path into an array of gdf columns
+/// \param[in] filename path to the parquet file
+/// \param[in] columns names of the columns that will be read from the file
+/// \param[out] out_gdf_columns array with the columns read
+/// \param[out] out_gdf_columns_length number of columns read
+extern "C" gdf_error read_parquet(const char *const filename,
+                                  const char *const *const columns,
+                                  gdf_column **const out_gdf_columns,
+                                  size_t *const out_gdf_columns_length);
+
+END_NAMESPACE_GDF_PARQUET
+
+#ifdef __cplusplus
+
+#include <arrow/io/interfaces.h>
+#include <string>
+#include <vector>
+
+namespace gdf {
+namespace parquet {
+
+/// \brief Read a parquet file from a file path into a vector of gdf columns
+/// \param[in] filename path to the parquet file
+/// \param[in] row_group_indices indices of the row groups that will be read from the file
+/// \param[in] column_indices indices of the columns that will be read from the file
+/// \param[out] out_gdf_columns vector of gdf_column pointers holding the data read
+gdf_error
+read_parquet_by_ids(const std::string &filename,
+                    const std::vector<std::size_t> &row_group_indices,
+                    const std::vector<std::size_t> &column_indices,
+                    std::vector<gdf_column *> &out_gdf_columns);
+
+/// \brief Read a parquet file from an arrow file interface into a vector of gdf columns
+/// \param[in] file random access file interface to the parquet data
+/// \param[in] row_group_indices indices of the row groups that will be read from the file
+/// \param[in] column_indices indices of the columns that will be read from the file
+/// \param[out] out_gdf_columns vector of gdf_column pointers holding the data read
+gdf_error
+read_parquet_by_ids(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+                    const std::vector<std::size_t> &row_group_indices,
+                    const std::vector<std::size_t> &column_indices,
+                    std::vector<gdf_column *> &out_gdf_columns);
+
+} // namespace parquet
+} // namespace gdf
+
+#endif
diff --git a/src/arrow/bit-stream.h b/src/arrow/bit-stream.h
new file mode 100644
index 00000000..79bb814a
--- /dev/null
+++ b/src/arrow/bit-stream.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2018 BlazingDB, Inc.
+ * Copyright 2018 Alexander Ocsa
+ * Copyright 2018 William Malpica
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef GDF_ARROW_UTIL_BIT_STREAM_UTILS_H
+#define GDF_ARROW_UTIL_BIT_STREAM_UTILS_H
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "arrow/util/bit-util.h"
+#include "arrow/util/bpacking.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace gdf {
+namespace arrow {
+    namespace internal {
+
+        /// Utility class to read bit/byte stream. This class can read bits or bytes
+        /// that are either byte aligned or not. It also has utilities to read multiple
+        /// bytes in one read (e.g. encoded int).
+        class BitReader {
+        public:
+            /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
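+            ///
+            /// A minimal usage sketch (illustrative only; 'page_data' and 'page_len' are
+            /// placeholder names for an encoded data page and its size in bytes):
+            ///
+            ///   BitReader reader(page_data, page_len);
+            ///   uint32_t values[32];
+            ///   int n = reader.GetBatch(/*num_bits=*/3, values, 32);  // n may be < 32 at the end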
+            BitReader(const uint8_t* buffer, int buffer_len)
+                : buffer_(buffer)
+                , max_bytes_(buffer_len)
+                , byte_offset_(0)
+                , bit_offset_(0)
+            {
+                int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+                memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+            }
+
+            BitReader()
+                : buffer_(NULL)
+                , max_bytes_(0)
+            {
+            }
+
+            void Reset(const uint8_t* buffer, int buffer_len)
+            {
+                buffer_ = buffer;
+                max_bytes_ = buffer_len;
+                byte_offset_ = 0;
+                bit_offset_ = 0;
+                int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+                memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+            }
+
+            /// Gets the next value from the buffer. Returns true if 'v' could be read or
+            /// false if there are not enough bytes left. num_bits must be <= 32.
+            template <typename T>
+            bool GetValue(int num_bits, T* v);
+
+            template <typename T>
+            void SetGpuBatchMetadata(int num_bits, T* v, int batch_size, int values_read,
+                std::vector<int>& unpack32InputOffsets,
+                std::vector<int>& unpack32InputRunLengths,
+                std::vector<int>& unpack32OutputOffsets,
+                std::vector<int>& remainderInputOffsets,
+                std::vector<int>& remainderBitOffsets,
+                std::vector<int>& remainderSetSize,
+                std::vector<int>& remainderOutputOffsets);
+
+            /// Get a number of values from the buffer. Return the number of values
+            /// actually read.
+            template <typename T>
+            int GetBatch(int num_bits, T* v, int batch_size);
+
+            /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+            /// needs to be a little-endian native type and big enough to store
+            /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+            /// be advanced to the start of the next byte before 'v' is read. Returns
+            /// false if there are not enough bytes left.
+            template <typename T>
+            bool GetAligned(int num_bytes, T* v);
+
+            /// Reads a vlq encoded int from the stream. The encoded int must start at
+            /// the beginning of a byte. Return false if there were not enough bytes in
+            /// the buffer.
+            bool GetVlqInt(int32_t* v);
+
+            // Reads a zigzag encoded int `into` v.
+            bool GetZigZagVlqInt(int32_t* v);
+
+            /// Returns the number of bytes left in the stream, not including the current
+            /// byte (i.e., there may be an additional fraction of a byte).
+            int bytes_left()
+            {
+                return max_bytes_ - (byte_offset_ + static_cast<int>(::arrow::BitUtil::Ceil(bit_offset_, 8)));
+            }
+
+            const uint8_t* get_buffer() { return buffer_; }
+            int get_buffer_len() { return max_bytes_; }
+
+            /// Maximum byte length of a vlq encoded int
+            static const int MAX_VLQ_BYTE_LEN = 5;
+
+        private:
+            const uint8_t* buffer_;
+            int max_bytes_;
+
+            /// Bytes are memcpy'd from buffer_ and values are read from this variable.
+            /// This is faster than reading values byte by byte directly from buffer_.
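+            /// At any point it caches up to 8 bytes starting at byte_offset_, of which
+            /// the low bit_offset_ bits have already been consumed.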
+            uint64_t buffered_values_;
+
+            int byte_offset_; // Offset in buffer_
+            int bit_offset_;  // Offset in buffered_values_
+        };
+
+        template <typename T>
+        inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
+                              int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+            *v = static_cast<T>(::arrow::BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
+                                *bit_offset);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+            *bit_offset += num_bits;
+            if (*bit_offset >= 64) {
+                *byte_offset += 8;
+                *bit_offset -= 64;
+
+                int bytes_remaining = max_bytes - *byte_offset;
+                if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+                    memcpy(buffered_values, buffer + *byte_offset, 8);
+                } else {
+                    memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
+                }
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800 4805)
+#endif
+                // Read bits of v that crossed into new buffered_values_
+                *v = *v | static_cast<T>(::arrow::BitUtil::TrailingBits(*buffered_values, *bit_offset)
+                                         << (num_bits - *bit_offset));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+                DCHECK_LE(*bit_offset, 64);
+            }
+        }
+
+        template <typename T>
+        inline bool BitReader::GetValue(int num_bits, T* v)
+        {
+            return GetBatch(num_bits, v, 1) == 1;
+        }
+
+        template <typename T>
+        inline void
+        BitReader::SetGpuBatchMetadata(int num_bits, T* v, int batch_size, int values_read,
+            std::vector<int>& unpack32InputOffsets,
+            std::vector<int>& unpack32InputRunLengths,
+            std::vector<int>& unpack32OutputOffsets,
+            std::vector<int>& remainderInputOffsets,
+            std::vector<int>& remainderBitOffsets,
+            std::vector<int>& remainderSetSize,
+            std::vector<int>& remainderOutputOffsets)
+        {
+            DCHECK(buffer_ != NULL);
+            // TODO: revisit this limit if necessary
+            DCHECK_LE(num_bits, 32);
+            // DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+            int bit_offset = bit_offset_;
+            int byte_offset = byte_offset_;
+            uint64_t buffered_values = buffered_values_;
+            int max_bytes = max_bytes_;
+
+            uint64_t needed_bits = num_bits * batch_size;
+            uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
+            if (remaining_bits < needed_bits) {
+                batch_size = static_cast<int>(remaining_bits) / num_bits;
+            }
+
+            int i = 0;
+            if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+                int byte_offset_start = byte_offset;
+                int bit_offset_start = bit_offset;
+                int i_start = i + values_read;
+
+                int count = 0;
+                for (; i < batch_size && bit_offset != 0; ++i) { // TODO this loop can be replaced with math
+                    bit_offset += num_bits;
+                    if (bit_offset >= 64) {
+                        byte_offset += 8;
+                        bit_offset -= 64;
+                    }
+                    count++;
+                }
+                if (count > 0) {
+                    remainderInputOffsets.push_back(byte_offset_start);
+                    remainderBitOffsets.push_back(bit_offset_start);
+                    remainderOutputOffsets.push_back(i_start);
+                    remainderSetSize.push_back(count);
+                }
+            }
+
+            int unpack_batch_size = (batch_size - i) / 32 * 32;
+
+            if (unpack_batch_size > 32) {
+                unpack32InputOffsets.push_back(byte_offset);
+                unpack32InputRunLengths.push_back(unpack_batch_size);
+                unpack32OutputOffsets.push_back(i + values_read);
+                i += unpack_batch_size;
+                byte_offset += unpack_batch_size * num_bits / 8;
+            }
+
+            int byte_offset_start = byte_offset;
+            int bit_offset_start = bit_offset;
+            int i_start = i + values_read;
+
+            int count = 0;
+            for (; i < batch_size; ++i) { // TODO this loop can be replaced with math
+                bit_offset += num_bits;
+                if (bit_offset >= 64) {
+                    byte_offset += 8;
+                    bit_offset -= 64;
+                }
+                count++;
+            }
+            if (count > 0) {
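+                // Record the values left over after the last aligned 32-value block as a
+                // 'remainder' run; they are handled separately from the unpack32 blocks.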
+                remainderInputOffsets.push_back(byte_offset_start);
+                remainderBitOffsets.push_back(bit_offset_start);
+                remainderOutputOffsets.push_back(i_start);
+                remainderSetSize.push_back(count);
+            }
+
+            bit_offset_ = bit_offset;
+            byte_offset_ = byte_offset;
+            buffered_values_ = buffered_values;
+        }
+
+        template <typename T>
+        inline int BitReader::GetBatch(int num_bits, T* v, int batch_size)
+        {
+            DCHECK(buffer_ != NULL);
+            // TODO: revisit this limit if necessary
+            DCHECK_LE(num_bits, 32);
+            DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+            int bit_offset = bit_offset_;
+            int byte_offset = byte_offset_;
+            uint64_t buffered_values = buffered_values_;
+            int max_bytes = max_bytes_;
+            const uint8_t* buffer = buffer_;
+
+            uint64_t needed_bits = num_bits * batch_size;
+            uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
+            if (remaining_bits < needed_bits) {
+                batch_size = static_cast<int>(remaining_bits) / num_bits;
+            }
+
+            int i = 0;
+            if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+                for (; i < batch_size && bit_offset != 0; ++i) {
+                    GetValue_(num_bits, &v[i], max_bytes, buffer,
+                              &bit_offset, &byte_offset, &buffered_values);
+                }
+            }
+
+            if (sizeof(T) == 4) {
+                int num_unpacked = ::arrow::internal::unpack32(
+                    reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+                    reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
+                i += num_unpacked;
+                byte_offset += num_unpacked * num_bits / 8;
+            } else {
+                const int buffer_size = 1024;
+                uint32_t unpack_buffer[buffer_size];
+                while (i < batch_size) {
+                    int unpack_size = std::min(buffer_size, batch_size - i);
+                    int num_unpacked = ::arrow::internal::unpack32(
+                        reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+                        unpack_buffer, unpack_size, num_bits);
+                    if (num_unpacked == 0) {
+                        break;
+                    }
+                    for (int k = 0; k < num_unpacked; ++k) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+                        v[i + k] = static_cast<T>(unpack_buffer[k]);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+                    }
+                    i += num_unpacked;
+                    byte_offset += num_unpacked * num_bits / 8;
+                }
+            }
+
+            int bytes_remaining = max_bytes - byte_offset;
+            if (bytes_remaining >= 8) {
+                memcpy(&buffered_values, buffer + byte_offset, 8);
+            } else {
+                memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
+            }
+            for (; i < batch_size; ++i) {
+                GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset,
+                          &byte_offset, &buffered_values);
+            }
+            bit_offset_ = bit_offset;
+            byte_offset_ = byte_offset;
+            buffered_values_ = buffered_values;
+
+            return batch_size;
+        }
+
+        template <typename T>
+        inline bool BitReader::GetAligned(int num_bytes, T* v)
+        {
+            DCHECK_LE(num_bytes, static_cast<int>(sizeof(T)));
+            int bytes_read = static_cast<int>(::arrow::BitUtil::Ceil(bit_offset_, 8));
+            if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_))
+                return false;
+
+            // Advance byte_offset to next unread byte and read num_bytes
+            byte_offset_ += bytes_read;
+            memcpy(v, buffer_ + byte_offset_, num_bytes);
+            byte_offset_ += num_bytes;
+
+            // Reset buffered_values_
+            bit_offset_ = 0;
+            int bytes_remaining = max_bytes_ - byte_offset_;
+            if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+                memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+            } else {
+                memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+            }
+            return true;
+        }
+
+        inline bool BitReader::GetVlqInt(int32_t* v)
+        {
+            *v = 0;
+            int shift = 0;
+            int num_bytes = 0;
+            uint8_t byte = 0;
+            do {
+                if (!GetAligned(1, &byte))
+                    return false;
+                *v |= (byte & 0x7F) << shift;
+                shift += 7;
+                DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN);
+            } while ((byte & 0x80) != 0);
return true; + } + + inline bool BitReader::GetZigZagVlqInt(int32_t* v) + { + int32_t u_signed; + if (!GetVlqInt(&u_signed)) + return false; + uint32_t u = static_cast(u_signed); + *reinterpret_cast(v) = (u >> 1) ^ -(static_cast(u & 1)); + return true; + } + } +} +} + +#endif diff --git a/src/arrow/bpacking.cuh b/src/arrow/bpacking.cuh new file mode 100644 index 00000000..2f3a7eb7 --- /dev/null +++ b/src/arrow/bpacking.cuh @@ -0,0 +1,3200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file was modified from its original version for inclusion in parquet-cpp. +// Original source: +// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp +// The original copyright notice follows. + +// This code is released under the +// Apache License Version 2.0 http://www.apache.org/licenses/. +// (c) Daniel Lemire 2013 + +#ifndef GDF_ARROW_UTIL_BPACKING_H +#define GDF_ARROW_UTIL_BPACKING_H + +#include "arrow/util/logging.h" + +namespace gdf { +namespace arrow { + namespace internal { + +__host__ __device__ inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) & 1; + out++; + *out = ((*in) >> 1) & 1; + out++; + *out = ((*in) >> 2) & 1; + out++; + *out = ((*in) >> 3) & 1; + out++; + *out = ((*in) >> 4) & 1; + out++; + *out = ((*in) >> 5) & 1; + out++; + *out = ((*in) >> 6) & 1; + out++; + *out = ((*in) >> 7) & 1; + out++; + *out = ((*in) >> 8) & 1; + out++; + *out = ((*in) >> 9) & 1; + out++; + *out = ((*in) >> 10) & 1; + out++; + *out = ((*in) >> 11) & 1; + out++; + *out = ((*in) >> 12) & 1; + out++; + *out = ((*in) >> 13) & 1; + out++; + *out = ((*in) >> 14) & 1; + out++; + *out = ((*in) >> 15) & 1; + out++; + *out = ((*in) >> 16) & 1; + out++; + *out = ((*in) >> 17) & 1; + out++; + *out = ((*in) >> 18) & 1; + out++; + *out = ((*in) >> 19) & 1; + out++; + *out = ((*in) >> 20) & 1; + out++; + *out = ((*in) >> 21) & 1; + out++; + *out = ((*in) >> 22) & 1; + out++; + *out = ((*in) >> 23) & 1; + out++; + *out = ((*in) >> 24) & 1; + out++; + *out = ((*in) >> 25) & 1; + out++; + *out = ((*in) >> 26) & 1; + out++; + *out = ((*in) >> 27) & 1; + out++; + *out = ((*in) >> 28) & 1; + out++; + *out = ((*in) >> 29) & 1; + out++; + *out = ((*in) >> 30) & 1; + out++; + *out = ((*in) >> 31); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + 
*out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 3); + out++; + *out = ((*in) >> 3) % (1U << 3); + out++; + *out = ((*in) >> 6) % (1U << 3); + out++; + *out = ((*in) >> 9) % (1U << 3); + out++; + *out = ((*in) >> 12) % (1U << 3); + out++; + *out = ((*in) >> 15) % (1U << 3); + out++; + *out = ((*in) >> 18) % (1U << 3); + out++; + *out = ((*in) >> 21) % (1U << 3); + out++; + *out = ((*in) >> 24) % (1U << 3); + out++; + *out = ((*in) >> 27) % (1U << 3); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 1)) << (3 - 1); + out++; + *out = ((*in) >> 1) % (1U << 3); + out++; + *out = ((*in) >> 4) % (1U << 3); + out++; + *out = ((*in) >> 7) % (1U << 3); + out++; + *out = ((*in) >> 10) % (1U << 3); + out++; + *out = ((*in) >> 13) % (1U << 3); + out++; + *out = ((*in) >> 16) % (1U << 3); + out++; + *out = ((*in) >> 19) % (1U << 3); + out++; + *out = ((*in) >> 22) % (1U << 3); + out++; + *out = ((*in) >> 25) % (1U << 3); + out++; + *out = ((*in) >> 28) % (1U << 3); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 2)) << (3 - 2); + out++; + *out = ((*in) >> 2) % (1U << 3); + out++; + *out = ((*in) >> 5) % (1U << 3); + out++; + *out = ((*in) >> 8) % (1U << 3); + out++; + *out = ((*in) >> 11) % (1U << 3); + out++; + *out = ((*in) >> 14) % (1U << 3); + out++; + *out = ((*in) >> 17) % (1U << 3); + out++; + *out = ((*in) >> 20) % (1U << 3); + out++; + *out = ((*in) >> 23) % (1U << 3); + out++; + *out = ((*in) >> 26) % (1U << 3); + out++; + *out = ((*in) >> 29); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = 
((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 5); + out++; + *out = ((*in) >> 5) % (1U << 5); + out++; + *out = ((*in) >> 10) % (1U << 5); + out++; + *out = ((*in) >> 15) % (1U << 5); + out++; + *out = ((*in) >> 20) % (1U << 5); + out++; + *out = ((*in) >> 25) % (1U << 5); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + out++; + *out = ((*in) >> 3) % (1U << 5); + out++; + *out = ((*in) >> 8) % (1U << 5); + out++; + *out = ((*in) >> 13) % (1U << 5); + out++; + *out = ((*in) >> 18) % (1U << 5); + out++; + *out = ((*in) >> 23) % (1U << 5); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 1)) << (5 - 1); + out++; + *out = ((*in) >> 1) % (1U << 5); + out++; + *out = ((*in) >> 6) % (1U << 5); + out++; + *out = ((*in) >> 11) % (1U << 5); + out++; + *out = ((*in) >> 16) % (1U << 5); + out++; + *out = ((*in) >> 21) % (1U << 5); + out++; + *out = ((*in) >> 26) % (1U << 5); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 4)) << (5 - 4); + out++; + *out = ((*in) >> 4) % (1U << 5); + out++; + *out = ((*in) >> 9) % (1U << 5); + out++; + *out = ((*in) >> 14) % (1U << 5); + out++; + *out = ((*in) >> 19) % (1U << 5); + out++; + *out = ((*in) >> 24) % (1U << 5); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 2)) << (5 - 2); + out++; + *out = ((*in) >> 2) % (1U << 5); + out++; + *out = ((*in) >> 7) % (1U << 5); + out++; + *out = ((*in) >> 12) % (1U << 5); + out++; + *out = ((*in) >> 17) % (1U << 5); + out++; + *out = ((*in) >> 22) % (1U << 5); + out++; + *out = ((*in) >> 27); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out 
= ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 7); + out++; + *out = ((*in) >> 7) % (1U << 7); + out++; + *out = ((*in) >> 14) % (1U << 7); + out++; + *out = ((*in) >> 21) % (1U << 7); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + out++; + *out = ((*in) >> 3) % (1U << 7); + out++; + *out = ((*in) >> 10) % (1U << 7); + out++; + *out = ((*in) >> 17) % (1U << 7); + out++; + *out = ((*in) >> 24) % (1U << 7); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 6)) << (7 - 6); + out++; + *out = ((*in) >> 6) % (1U << 7); + out++; + *out = ((*in) >> 13) % (1U << 7); + out++; + *out = ((*in) >> 20) % (1U << 7); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 2)) << (7 - 2); + out++; + *out = ((*in) >> 2) % (1U << 7); + out++; + *out = ((*in) >> 9) % (1U << 7); + out++; + *out = ((*in) >> 16) % (1U << 7); + out++; + *out = ((*in) >> 23) % (1U << 7); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 5)) << (7 - 5); + out++; + *out = ((*in) >> 5) % (1U << 7); + out++; + *out = ((*in) >> 12) % (1U << 7); + out++; + *out = ((*in) >> 19) % (1U << 7); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 1)) << (7 - 1); + out++; + *out = ((*in) >> 1) % (1U << 7); + out++; + *out = ((*in) >> 8) % (1U << 7); + out++; + *out = ((*in) >> 15) % (1U << 7); + out++; + *out = ((*in) >> 22) % (1U << 7); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 4)) << (7 - 4); + out++; + *out = ((*in) >> 4) % (1U << 7); + out++; + *out = ((*in) >> 11) % (1U << 7); + out++; + *out = ((*in) >> 18) % (1U << 7); + out++; + *out = ((*in) >> 25); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + 
++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 9); + out++; + *out = ((*in) >> 9) % (1U << 9); + out++; + *out = ((*in) >> 18) % (1U << 9); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + out++; + *out = ((*in) >> 4) % (1U << 9); + out++; + *out = ((*in) >> 13) % (1U << 9); + out++; + *out = ((*in) >> 22) % (1U << 9); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + out++; + *out = ((*in) >> 8) % (1U << 9); + out++; + *out = ((*in) >> 17) % (1U << 9); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 3)) << (9 - 3); + out++; + *out = ((*in) >> 3) % (1U << 9); + out++; + *out = ((*in) >> 12) % (1U << 9); + out++; + *out = ((*in) >> 21) % (1U << 9); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 7)) << (9 - 7); + out++; + *out = ((*in) >> 7) % (1U << 9); + out++; + *out = ((*in) >> 16) % (1U << 9); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 2)) << (9 - 2); + out++; + *out = ((*in) >> 2) % (1U << 9); + out++; + *out = ((*in) >> 11) % (1U << 9); + out++; + *out = ((*in) >> 20) % (1U << 9); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 6)) << (9 - 6); + out++; + *out = ((*in) >> 6) % (1U << 9); + out++; + *out = ((*in) >> 15) % (1U << 9); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 1)) << (9 - 1); + out++; + *out = ((*in) >> 1) % (1U << 9); + out++; + *out = ((*in) >> 10) % (1U << 9); + out++; + *out = ((*in) >> 19) % (1U << 9); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 5)) << (9 - 5); + out++; + *out = ((*in) >> 5) % (1U << 9); + out++; + *out = ((*in) >> 14) % (1U << 9); + out++; + *out = ((*in) >> 23); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= 
((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 11); + out++; + *out = ((*in) >> 11) % (1U << 11); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + out++; + *out = ((*in) >> 1) % (1U << 11); + out++; + *out = ((*in) >> 12) % (1U << 11); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + out++; + *out = ((*in) >> 2) % (1U << 11); + out++; + *out = ((*in) >> 13) % (1U << 11); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 3)) << (11 - 3); + out++; + *out = ((*in) >> 3) % (1U << 11); + out++; + *out = ((*in) >> 14) % (1U << 11); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 4)) << (11 - 4); + out++; + *out = ((*in) >> 4) % (1U << 11); + out++; + *out = ((*in) >> 15) % (1U << 11); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 5)) << (11 - 5); + out++; + *out = ((*in) >> 5) % (1U << 11); + out++; + *out = ((*in) >> 16) % (1U << 11); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 6)) << (11 - 6); + out++; + *out = ((*in) >> 6) % (1U << 11); + out++; + *out = ((*in) >> 17) % (1U << 11); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 7)) << (11 - 7); + out++; + *out = ((*in) >> 7) % (1U << 11); + out++; + *out = ((*in) >> 18) % (1U << 11); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 8)) << (11 - 8); + out++; + *out = ((*in) >> 8) % (1U << 11); + out++; + *out = ((*in) >> 19) % (1U << 11); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 9)) << (11 - 9); + out++; + *out = ((*in) >> 9) % (1U << 11); + out++; + *out = ((*in) >> 20) % (1U << 11); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 10)) << (11 - 10); + out++; + *out = ((*in) >> 10) % (1U << 11); + out++; + *out = ((*in) >> 21); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + 
*out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 13); + out++; + *out = ((*in) >> 13) % (1U << 13); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + out++; + *out = ((*in) >> 7) % (1U << 13); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + out++; + *out = ((*in) >> 1) % (1U << 13); + out++; + *out = ((*in) >> 14) % (1U << 13); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + out++; + *out = ((*in) >> 8) % (1U << 13); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 2)) << (13 - 2); + out++; + *out = ((*in) >> 2) % (1U << 13); + out++; + *out = ((*in) >> 15) % (1U << 13); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 9)) << (13 - 9); + out++; + *out = ((*in) >> 9) % (1U << 13); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 3)) << (13 - 3); + out++; + *out = ((*in) >> 3) % (1U << 13); + out++; + *out = ((*in) >> 16) % (1U << 13); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 10)) << (13 - 10); + out++; + *out = ((*in) >> 10) % (1U << 13); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 4)) << (13 - 4); + out++; + *out = ((*in) >> 4) % (1U << 13); + out++; + *out = ((*in) >> 17) % (1U << 13); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 11)) << (13 - 11); + out++; + *out = ((*in) >> 11) % (1U << 13); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 5)) << (13 - 5); + out++; + *out = ((*in) >> 5) % (1U << 13); + out++; + *out = ((*in) >> 18) % (1U << 13); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 12)) << (13 - 12); + out++; + *out = ((*in) >> 12) % (1U << 13); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 6)) << (13 - 6); + out++; + *out = ((*in) >> 6) % (1U << 13); + out++; + *out = ((*in) >> 19); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) 
>> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 15); + out++; + *out = ((*in) >> 15) % (1U << 15); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + out++; + *out = ((*in) >> 13) % (1U << 15); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + out++; + *out = ((*in) >> 11) % (1U << 15); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + out++; + *out = ((*in) >> 9) % (1U << 15); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 7)) << (15 - 7); + out++; + *out = ((*in) >> 7) % (1U << 15); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 5)) << (15 - 5); + out++; + *out = ((*in) >> 5) % (1U << 15); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 3)) << (15 - 3); + out++; + *out = ((*in) >> 3) % (1U << 15); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 1)) << (15 - 1); + out++; + *out = ((*in) >> 1) % (1U << 15); + out++; + *out = ((*in) >> 16) % (1U << 15); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 14)) << (15 - 14); + out++; + *out = ((*in) >> 14) % (1U << 15); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 12)) << (15 - 12); + out++; + *out = ((*in) >> 12) % (1U << 15); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 10)) << (15 - 10); + out++; + *out = ((*in) >> 10) % (1U << 15); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 8)) << (15 - 8); + out++; + *out = ((*in) >> 8) % (1U << 15); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 6)) << (15 - 6); + out++; + *out = ((*in) >> 6) % (1U << 15); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 4)) << (15 - 4); + out++; + *out = ((*in) >> 4) % (1U << 15); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 2)) << (15 - 2); + out++; + *out = ((*in) >> 2) % (1U << 15); + out++; + *out = ((*in) >> 17); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; 
+ *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + out++; + *out = ((*in) >> 2) % (1U << 17); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + out++; + *out = ((*in) >> 4) % (1U << 17); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 6)) << (17 - 6); + out++; + *out = ((*in) >> 6) % (1U << 17); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + out++; + *out = ((*in) >> 8) % (1U << 17); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 10)) << (17 - 10); + out++; + *out = ((*in) >> 10) % (1U << 17); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 12)) << (17 - 12); + out++; + *out = ((*in) >> 12) % (1U << 17); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 14)) << (17 - 14); + out++; + *out = ((*in) >> 14) % (1U << 17); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 16)) << (17 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 1)) << (17 - 1); + out++; + *out = ((*in) >> 1) % (1U << 17); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 3)) << (17 - 3); + out++; + *out = ((*in) >> 3) % (1U << 17); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 5)) << (17 - 5); + out++; + *out = ((*in) >> 5) % (1U << 17); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 7)) << (17 - 7); + out++; + *out = ((*in) >> 7) % (1U << 17); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 9)) << (17 - 9); + out++; + *out = ((*in) >> 9) % (1U << 17); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 11)) << (17 - 11); + out++; + *out = ((*in) >> 11) % (1U << 17); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 13)) << (17 - 13); + out++; + *out = ((*in) >> 13) % (1U << 17); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 15)) << (17 - 15); + out++; + *out = ((*in) >> 15); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= 
((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + out++; + *out = ((*in) >> 6) % (1U << 19); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + out++; + *out = ((*in) >> 12) % (1U << 19); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + out++; + *out = ((*in) >> 5) % (1U << 19); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 11)) << (19 - 11); + out++; + *out = ((*in) >> 11) % (1U << 19); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 17)) << (19 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 4)) << (19 - 4); + out++; + *out = ((*in) >> 4) % (1U << 19); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 10)) << (19 - 10); + out++; + *out = ((*in) >> 10) % (1U << 19); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 16)) << (19 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 3)) << (19 - 3); + out++; + *out = ((*in) >> 3) % (1U << 19); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 9)) << (19 - 9); + out++; + *out = ((*in) >> 9) % (1U << 19); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 15)) << (19 - 15); + out++; + *out = ((*in) >> 
15); + ++in; + *out |= ((*in) % (1U << 2)) << (19 - 2); + out++; + *out = ((*in) >> 2) % (1U << 19); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 8)) << (19 - 8); + out++; + *out = ((*in) >> 8) % (1U << 19); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 14)) << (19 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 1)) << (19 - 1); + out++; + *out = ((*in) >> 1) % (1U << 19); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 7)) << (19 - 7); + out++; + *out = ((*in) >> 7) % (1U << 19); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 13)) << (19 - 13); + out++; + *out = ((*in) >> 13); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + out++; + *out = ((*in) >> 10) % (1U << 21); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + out++; + *out = ((*in) >> 9) % (1U << 21); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + 
out++; + *out = ((*in) >> 8) % (1U << 21); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 18)) << (21 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 7)) << (21 - 7); + out++; + *out = ((*in) >> 7) % (1U << 21); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 17)) << (21 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 6)) << (21 - 6); + out++; + *out = ((*in) >> 6) % (1U << 21); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 16)) << (21 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 5)) << (21 - 5); + out++; + *out = ((*in) >> 5) % (1U << 21); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 15)) << (21 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 4)) << (21 - 4); + out++; + *out = ((*in) >> 4) % (1U << 21); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 14)) << (21 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 3)) << (21 - 3); + out++; + *out = ((*in) >> 3) % (1U << 21); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 13)) << (21 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 2)) << (21 - 2); + out++; + *out = ((*in) >> 2) % (1U << 21); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 12)) << (21 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 1)) << (21 - 1); + out++; + *out = ((*in) >> 1) % (1U << 21); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 11)) << (21 - 11); + out++; + *out = ((*in) >> 11); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + 
out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + out++; + *out = ((*in) >> 5) % (1U << 23); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 1)) << (23 - 1); + out++; + *out = ((*in) >> 1) % (1U << 23); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 15)) << (23 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 6)) << (23 - 6); + out++; + *out = ((*in) >> 6) % (1U << 23); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 20)) << (23 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 11)) << (23 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 2)) << (23 - 2); + out++; + *out = ((*in) >> 2) % (1U << 23); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 16)) << (23 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 7)) << (23 - 7); + out++; + *out = ((*in) >> 7) % (1U << 23); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 21)) << (23 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 12)) << (23 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 3)) << (23 - 3); + out++; + *out = ((*in) >> 3) % (1U << 23); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 17)) << (23 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 8)) << (23 - 8); + out++; + *out = ((*in) >> 8) % (1U << 23); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 22)) << (23 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 13)) << (23 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 4)) << (23 - 4); + out++; + *out = ((*in) >> 4) % (1U << 23); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 18)) << (23 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 9)) << (23 - 9); + out++; + *out = ((*in) >> 9); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 
0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + out++; + *out = ((*in) >> 4) % (1U << 25); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 1)) << (25 - 1); + out++; + *out = ((*in) >> 1) % (1U << 25); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 19)) << (25 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 12)) << (25 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 5)) << (25 - 5); + out++; + *out = ((*in) >> 5) % (1U << 25); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 23)) << (25 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 16)) << (25 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 9)) << (25 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 2)) << (25 - 2); + out++; + *out = ((*in) >> 2) % (1U << 25); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 20)) << (25 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 13)) << (25 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 6)) << (25 - 6); + out++; + *out = ((*in) >> 6) % (1U << 25); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 24)) << (25 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 17)) << (25 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= 
((*in) % (1U << 10)) << (25 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 3)) << (25 - 3); + out++; + *out = ((*in) >> 3) % (1U << 25); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 21)) << (25 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 14)) << (25 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 7)) << (25 - 7); + out++; + *out = ((*in) >> 7); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + 
out++; + *out = ((*in) >> 2) % (1U << 27); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 19)) << (27 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 14)) << (27 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 9)) << (27 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 4)) << (27 - 4); + out++; + *out = ((*in) >> 4) % (1U << 27); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 26)) << (27 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 21)) << (27 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 16)) << (27 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 11)) << (27 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 6)) << (27 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 1)) << (27 - 1); + out++; + *out = ((*in) >> 1) % (1U << 27); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 23)) << (27 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 18)) << (27 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 13)) << (27 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 8)) << (27 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 3)) << (27 - 3); + out++; + *out = ((*in) >> 3) % (1U << 27); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 25)) << (27 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 20)) << (27 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 15)) << (27 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 10)) << (27 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 5)) << (27 - 5); + out++; + *out = ((*in) >> 5); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) 
>> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 11)) << (29 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 5)) << (29 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 2)) << (29 - 2); + out++; + *out = ((*in) >> 2) % (1U << 29); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 28)) << (29 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 25)) << (29 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 22)) << (29 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 19)) << (29 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 16)) << (29 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 13)) << (29 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 10)) << (29 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 7)) << (29 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 4)) << (29 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 1)) << (29 - 1); + out++; + *out = ((*in) >> 1) % (1U << 29); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 27)) << (29 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 24)) << (29 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 21)) << (29 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 18)) << (29 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 15)) << (29 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 12)) << (29 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 9)) << (29 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 6)) << (29 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= 
((*in) % (1U << 3)) << (29 - 3); + out++; + *out = ((*in) >> 3); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 31); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 27)) << (31 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= 
((*in) % (1U << 23)) << (31 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 22)) << (31 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 21)) << (31 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 20)) << (31 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 19)) << (31 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 18)) << (31 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 17)) << (31 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 16)) << (31 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 15)) << (31 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 14)) << (31 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 13)) << (31 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 12)) << (31 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 11)) << (31 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 10)) << (31 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 9)) << (31 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 8)) << (31 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 7)) << (31 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 6)) << (31 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 5)) << (31 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 4)) << (31 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 3)) << (31 - 3); + out++; + *out = ((*in) >> 3); + ++in; + *out |= ((*in) % (1U << 2)) << (31 - 2); + out++; + *out = ((*in) >> 2); + ++in; + *out |= ((*in) % (1U << 1)) << (31 - 1); + out++; + *out = ((*in) >> 1); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) { + for (int k = 0; k < 32; ++k) { + out[k] = 0; + } + return in; +} + + +} // namespace internal 
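The unrolled unpackN_32 bodies above all follow one pattern: read N 32-bit words, emit 32 values of N bits each, and OR in the high part of any value that straddles a word boundary. As a reading aid only (not part of the patch), a width-generic loop that reproduces the same packing convention could look like the sketch below; the name unpack_generic_32 is invented here purely for illustration.

#include <cstdint>

// Illustrative sketch: width-generic equivalent of the unrolled unpackN_32
// routines above (little-endian packing, 32 values of num_bits bits per call).
__host__ __device__ inline const uint32_t* unpack_generic_32(const uint32_t* in,
                                                             uint32_t* out,
                                                             int num_bits) {
    const uint32_t mask =
        (num_bits >= 32) ? 0xFFFFFFFFu : ((1u << num_bits) - 1u);
    int bit_pos = 0;  // bit offset of the next value inside the current word
    for (int k = 0; k < 32; ++k) {
        uint32_t value = (*in) >> bit_pos;
        if (bit_pos + num_bits > 32) {         // value straddles two words
            ++in;
            value |= (*in) << (32 - bit_pos);  // splice in the high bits
            bit_pos += num_bits - 32;
        } else {
            bit_pos += num_bits;
            if (bit_pos == 32) { ++in; bit_pos = 0; }
        }
        out[k] = value & mask;                 // keep only num_bits bits
    }
    return in;  // one past the num_bits words consumed
}

The unrolled versions in the header are this loop specialized per bit width, so every shift amount and mask becomes a compile-time constant; that is why they are preferred in device code over the generic form.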
+} // namespace arrow +} // namespace gdf + +#endif // GDF_ARROW_UTIL_BPACKING_H diff --git a/src/arrow/cu_decoder.cu b/src/arrow/cu_decoder.cu new file mode 100644 index 00000000..4abe90d6 --- /dev/null +++ b/src/arrow/cu_decoder.cu @@ -0,0 +1,616 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Alexander Ocsa + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cu_decoder.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "bpacking.cuh" +#include "util/pinned_allocator.cuh" + +namespace gdf +{ +namespace arrow +{ +namespace internal { + +CachingPinnedAllocator pinnedAllocator(2, 14, 29, 1024*1024*1024*1ull); + +namespace detail +{ + +#define ARROW_PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) + +#define ARROW_DEBUG (-1) +#define ARROW_INFO 0 +#define ARROW_WARNING 1 +#define ARROW_ERROR 2 +#define ARROW_FATAL 3 + +class CerrLog +{ + public: + CerrLog(int severity) // NOLINT(runtime/explicit) + : severity_(severity), + has_logged_(false) + { + } + + virtual ~CerrLog() + { + if (has_logged_) + { + std::cerr << std::endl; + } + if (severity_ == ARROW_FATAL) + { + std::exit(1); + } + } + + template + CerrLog &operator<<(const T &t) + { + if (severity_ != ARROW_DEBUG) + { + has_logged_ = true; + std::cerr << t; + } + return *this; + } + + protected: + const int severity_; + bool has_logged_; +}; + + +/// Returns the 'num_bits' least-significant bits of 'v'. 
+__device__ __host__ static inline uint64_t TrailingBits(uint64_t v, + int num_bits) +{ + if (ARROW_PREDICT_FALSE(num_bits == 0)) + return 0; + if (ARROW_PREDICT_FALSE(num_bits >= 64)) + return v; + int n = 64 - num_bits; + return (v << n) >> n; +} + +template +__device__ __host__ inline void GetValue_(int num_bits, T *v, int max_bytes, + const uint8_t *buffer, + int *bit_offset, int *byte_offset, + uint64_t *buffered_values) +{ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800) +#endif + *v = static_cast(TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + *bit_offset += num_bits; + + if (*bit_offset >= 64) + { + *byte_offset += 8; + *bit_offset -= 64; + + int bytes_remaining = max_bytes - *byte_offset; + if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) + { + memcpy(buffered_values, buffer + *byte_offset, 8); + } + else + { + memcpy(buffered_values, buffer + *byte_offset, bytes_remaining); + } +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800 4805) +#endif + // Read bits of v that crossed into new buffered_values_ + *v = *v | static_cast(TrailingBits(*buffered_values, *bit_offset) + << (num_bits - *bit_offset)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + // DCHECK_LE(*bit_offset, 64); + } +} + +} // namespace detail + +template +OutputIterator gpu_expand(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputIterator output) +{ + typedef typename thrust::iterator_difference::type + difference_type; + + difference_type input_size = thrust::distance(first1, last1); + difference_type output_size = thrust::reduce(first1, last1); + + // scan the counts to obtain output offsets for each input element + thrust::device_vector output_offsets(input_size, 0); + thrust::exclusive_scan(first1, last1, output_offsets.begin()); + + // scatter the nonzero counts into their corresponding output positions + thrust::device_vector output_indices(output_size, 0); + thrust::scatter_if(thrust::counting_iterator(0), + thrust::counting_iterator(input_size), + output_offsets.begin(), first1, output_indices.begin()); + + // compute max-scan over the output indices, filling in the holes + thrust::inclusive_scan(output_indices.begin(), output_indices.end(), + output_indices.begin(), + thrust::maximum()); + + // gather input values according to index array (output = + // first2[output_indices]) + OutputIterator output_end = output; + thrust::advance(output_end, output_size); + thrust::gather(output_indices.begin(), output_indices.end(), first2, output); + + // return output + output_size + thrust::advance(output, output_size); + return output; +} + +__host__ __device__ inline const uint32_t* unpack32(const uint32_t* in, uint32_t* out, int num_bits) { + const uint32_t* (*UnpackFunctionPtr[])(const uint32_t* in, uint32_t* out) = {nullunpacker32, unpack1_32, unpack2_32, unpack3_32, unpack4_32, unpack5_32, unpack6_32, unpack7_32, unpack8_32, unpack9_32, unpack10_32, unpack11_32, unpack12_32, unpack13_32, unpack14_32, unpack15_32, unpack16_32, unpack17_32, unpack18_32, unpack19_32, unpack20_32, unpack21_32, unpack22_32, unpack23_32, unpack24_32, unpack25_32, unpack26_32, unpack27_32, unpack28_32, unpack29_32, unpack30_32, unpack31_32, unpack32_32}; + return UnpackFunctionPtr[num_bits](in, out); +} + +template +struct unpack_functor + : public thrust::binary_function +{ + int num_bits; + unpack_functor(int num_bits) : num_bits(num_bits) { + + } + __host__ __device__ uint32_t 
operator()(uint8_t &input, T &output) + { + uint32_t *input_ptr = (uint32_t *)&input; + uint32_t *output_ptr = (uint32_t *)&output; + + unpack32(input_ptr, output_ptr, num_bits); + + return 0; + } +}; + +template + __global__ + void decode_bitpacking_32sets(uint8_t *buffer, int *output, int *input_offsets, int *input_run_lengths, int num_sets, + int * output_offsets, short bit_width, int max_num_sets_in_run, Func unpack_func) + { + + extern __shared__ uint8_t temp[]; + + const short INPUT_BLOCK_BYTES = bit_width * 32 / 8; + const short OUTPUT_BLOCK_BYTES = 32 * 4; + const short BLOCK_SIZE = 32; + const short IO_BLOCK = INPUT_BLOCK_BYTES + OUTPUT_BLOCK_BYTES; // size in bytes of INPUT and OUTPUT BLOCK + + int index = blockIdx.x * blockDim.x + threadIdx.x; + + int set_index = index/max_num_sets_in_run; + + if (set_index < num_sets){ + int intput_index = input_offsets[set_index] + INPUT_BLOCK_BYTES * (index % max_num_sets_in_run); + int output_index = output_offsets[set_index] + BLOCK_SIZE * (index % max_num_sets_in_run); + + if ((INPUT_BLOCK_BYTES * (index % max_num_sets_in_run)) < input_run_lengths[set_index]*bit_width/8) { // if we want to actually process + + uint8_t * temp_in = &temp[IO_BLOCK * threadIdx.x]; + int *temp_out = (int*)&temp[IO_BLOCK * threadIdx.x + INPUT_BLOCK_BYTES]; + + for (int i = 0; i < INPUT_BLOCK_BYTES; i++){ + temp_in[i] = buffer[intput_index + i]; + } + unpack_func(temp_in[0], temp_out[0]); + + for (int i = 0; i < BLOCK_SIZE; i++){ + output[output_index + i] = temp_out[i]; + } + } + } + } + +typedef thrust::tuple Int4; + +template +struct remainder_functor : public thrust::unary_function +{ + int max_bytes; + int num_bits; + uint8_t *d_buffer; + T *ptr_output; + remainder_functor(int max_bytes, int num_bits, uint8_t *buffer, + T *ptr_output) + : max_bytes(max_bytes), num_bits(num_bits), d_buffer(buffer), ptr_output(ptr_output) + { + } + __device__ __host__ int operator()(Int4 tuple) + { + int bit_offset = thrust::get<0>(tuple); // remainderBitOffsets[k]; + int byte_offset = thrust::get<1>(tuple); // remainderInputOffsets[k]; + uint64_t buffered_values = 0; + + int bytes_remaining = max_bytes - byte_offset; + if (bytes_remaining >= 8) + { + memcpy(&buffered_values, d_buffer + byte_offset, 8); + } + else + { + memcpy(&buffered_values, d_buffer + byte_offset, bytes_remaining); + } + int i = thrust::get<2>(tuple); // remainderOutputOffsets[k]; + int batch_size = thrust::get<2>(tuple) + thrust::get<3>(tuple); // remainderOutputOffsets[k] + remainderSetSize[k]; + for (; i < batch_size; ++i) + { + detail::GetValue_(num_bits, &ptr_output[i], max_bytes, (uint8_t *)d_buffer, + &bit_offset, &byte_offset, &buffered_values); + } + return 0; + } +}; + +template +void gpu_bit_packing_remainder( thrust::device_vector & d_buffer, + const std::vector &remainderInputOffsets, + const std::vector &remainderBitOffsets, + const std::vector &remainderSetSize, + const std::vector &remainderOutputOffsets, + thrust::device_vector& d_output, + int num_bits) +{ + + thrust::device_vector d_remainder_input_offsets(remainderInputOffsets); + thrust::device_vector d_remainder_bit_offsets(remainderBitOffsets); + thrust::device_vector d_remainder_setsize(remainderSetSize); + thrust::device_vector d_remainder_output_offsets(remainderOutputOffsets); + + int max_bytes = d_buffer.size(); + auto zip_iterator_begin = thrust::make_zip_iterator(thrust::make_tuple( + d_remainder_bit_offsets.begin(), d_remainder_input_offsets.begin(), + d_remainder_output_offsets.begin(), d_remainder_setsize.begin())); + auto 
zip_iterator_end = thrust::make_zip_iterator(thrust::make_tuple( + d_remainder_bit_offsets.end(), d_remainder_input_offsets.end(), + d_remainder_output_offsets.end(), d_remainder_setsize.end())); + + thrust::transform( + thrust::device, zip_iterator_begin, zip_iterator_end, + thrust::make_discard_iterator(), + remainder_functor(max_bytes, num_bits, d_buffer.data().get(), + d_output.data().get())); + +} + + +template +void gpu_bit_packing(const uint8_t *buffer, + const int buffer_len, + const std::vector &input_offset, + const std::vector>& bitpackset, + const std::vector &output_offset, + thrust::device_vector& d_output, + int num_bits) +{ + thrust::device_vector d_output_offset(output_offset); + int step_size = 32 * num_bits / 8; + uint8_t* h_bit_buffer; + pinnedAllocator.pinnedAllocate((void **)&h_bit_buffer, step_size * input_offset.size()); + + thrust::host_vector h_bit_offset; + for (int i = 0; i < input_offset.size(); i++){ + h_bit_offset.push_back(i*step_size); + } + int sum = 0; + for (auto &&pair : bitpackset) { + memcpy ( &h_bit_buffer[sum] , &buffer[pair.first], pair.second ); + sum += pair.second; + } + thrust::device_vector d_bit_buffer(h_bit_buffer, h_bit_buffer + step_size * input_offset.size()); + thrust::device_vector d_bit_offset(h_bit_offset); + + thrust::transform(thrust::cuda::par, + thrust::make_permutation_iterator(d_bit_buffer.begin(), d_bit_offset.begin()), + thrust::make_permutation_iterator(d_bit_buffer.end(), d_bit_offset.end()), + thrust::make_permutation_iterator(d_output.begin(), d_output_offset.begin()), + thrust::make_discard_iterator(), unpack_functor(num_bits)); + pinnedAllocator.pinnedFree(h_bit_buffer); +} + +template +int decode_using_gpu(const T * d_dictionary, int num_dictionary_values, T* d_output, const uint8_t *buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector &input_offset, + const std::vector &input_runlengths, + const std::vector &output_offset, + const std::vector &remainderInputOffsets, + const std::vector &remainderBitOffsets, + const std::vector &remainderSetSize, + const std::vector &remainderOutputOffsets, + int num_bits, int batch_size) +{ + thrust::device_vector d_indices(batch_size); + + { + thrust::device_vector d_counts(rle_runs); + thrust::device_vector d_values(rle_values); + gpu_expand(d_counts.begin(), d_counts.end(), d_values.begin(), d_indices.begin()); + } + + thrust::device_vector d_buffer(buffer_len); + thrust::copy(buffer, buffer + buffer_len, d_buffer.begin()); + if (input_offset.size() > 0){ + unpack_functor func(num_bits); + thrust::device_vector d_input_offsets(input_offset); + thrust::device_vector d_input_runlengths(input_runlengths); + thrust::device_vector d_output_offset(output_offset); + + int max_num_sets_in_run = thrust::reduce(thrust::device, + d_input_runlengths.begin(), d_input_runlengths.end(), + 0, + thrust::maximum()); + max_num_sets_in_run = max_num_sets_in_run/32; + + int max_total_sets = max_num_sets_in_run * input_offset.size(); + + int blocksize = std::min(128, max_total_sets); + int gridsize = (max_total_sets + blocksize - 1) / blocksize; + + int shared_memory = blocksize * (num_bits * 32/8 + 32 * 4); + + decode_bitpacking_32sets<<>>(thrust::raw_pointer_cast(d_buffer.data()), thrust::raw_pointer_cast(d_indices.data()), + thrust::raw_pointer_cast(d_input_offsets.data()), thrust::raw_pointer_cast(d_input_runlengths.data()), input_offset.size(), + thrust::raw_pointer_cast(d_output_offset.data()), num_bits, max_num_sets_in_run, func); + + } 
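// Launch-configuration sketch for decode_bitpacking_32sets above (the empty
// "<<>>" is presumably <<<gridsize, blocksize, shared_memory>>>, the three
// values computed right before the call). Each thread unpacks one 32-value
// bit-packed set through shared memory, so the dynamic shared-memory budget is:
//   bytes per thread = INPUT_BLOCK_BYTES + OUTPUT_BLOCK_BYTES
//                    = (num_bits * 32 / 8) + (32 * 4)
//   e.g. num_bits = 14  ->  56 + 128 = 184 bytes per thread
//   blocksize = min(128, max_total_sets)  ->  at most 128 * 184 = 23,552 bytes
//   gridsize  = (max_total_sets + blocksize - 1) / blocksize
// which stays comfortably below the usual 48 KB per-block limit for dynamic
// shared memory.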
+ + if (remainderInputOffsets.size() > 0){ + gpu_bit_packing_remainder(d_buffer, remainderInputOffsets, remainderBitOffsets, remainderSetSize, remainderOutputOffsets, d_indices, num_bits); + } + + thrust::gather(thrust::device, + d_indices.begin(), d_indices.end(), + d_dictionary, + d_output); + return batch_size; +} + +template +struct copy_functor : public thrust::unary_function +{ + __host__ __device__ T operator()(int input) + { + return static_cast(input); + } +}; + +template +int decode_def_levels(const uint8_t* buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* output, int batch_size) +{ + + thrust::device_vector d_indices(batch_size); + + { + thrust::device_vector d_counts(rle_runs); + thrust::device_vector d_values(rle_values); + gpu_expand(d_counts.begin(), d_counts.end(), d_values.begin(), d_indices.begin()); + } + + thrust::device_vector d_buffer(buffer_len); + thrust::copy(buffer, buffer + buffer_len, d_buffer.begin()); + if (input_offset.size() > 0){ + unpack_functor func(num_bits); + thrust::device_vector d_input_offsets(input_offset); + thrust::device_vector d_input_runlengths(input_runlengths); + thrust::device_vector d_output_offset(output_offset); + + int max_num_sets_in_run = thrust::reduce(thrust::device, + d_input_runlengths.begin(), d_input_runlengths.end(), + 0, + thrust::maximum()); + max_num_sets_in_run = max_num_sets_in_run/32; + + int max_total_sets = max_num_sets_in_run * input_offset.size(); + + int blocksize = std::min(128, max_total_sets); + int gridsize = (max_total_sets + blocksize - 1) / blocksize; + + int shared_memory = blocksize * (num_bits * 32/8 + 32 * 4); + + decode_bitpacking_32sets<<>>(thrust::raw_pointer_cast(d_buffer.data()), thrust::raw_pointer_cast(d_indices.data()), + thrust::raw_pointer_cast(d_input_offsets.data()), thrust::raw_pointer_cast(d_input_runlengths.data()), input_offset.size(), + thrust::raw_pointer_cast(d_output_offset.data()), num_bits, max_num_sets_in_run, func); + + } + + if (remainderInputOffsets.size() > 0){ + gpu_bit_packing_remainder(d_buffer, remainderInputOffsets, remainderBitOffsets, remainderSetSize, remainderOutputOffsets, d_indices, num_bits); + } + + thrust::transform(thrust::device, d_indices.begin(), d_indices.end(), output, copy_functor()); + return batch_size; +} + +template +int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* device_output, int batch_size) +{ + + thrust::device_vector d_output_int(batch_size); + thrust::device_vector d_buffer(buffer_len); + thrust::copy(buffer, buffer + buffer_len, d_buffer.begin()); + + if (input_offset.size() > 0){ + + unpack_functor func(num_bits); + thrust::device_vector d_input_offsets(input_offset); + thrust::device_vector d_input_runlengths(input_runlengths); + thrust::device_vector d_output_offset(output_offset); + + int max_num_sets_in_run = thrust::reduce(thrust::device, + d_input_runlengths.begin(), 
d_input_runlengths.end(), + 0, + thrust::maximum()); + max_num_sets_in_run = max_num_sets_in_run/32; + + int max_total_sets = max_num_sets_in_run * input_offset.size(); + + int blocksize = std::min(128, max_total_sets); + int gridsize = (max_total_sets + blocksize - 1) / blocksize; + + int shared_memory = blocksize * (num_bits * 32/8 + 32 * 4); + + decode_bitpacking_32sets<<>>(thrust::raw_pointer_cast(d_buffer.data()), thrust::raw_pointer_cast(d_output_int.data()), + thrust::raw_pointer_cast(d_input_offsets.data()), thrust::raw_pointer_cast(d_input_runlengths.data()), input_offset.size(), + thrust::raw_pointer_cast(d_output_offset.data()), num_bits, max_num_sets_in_run, func); + + } + + if (remainderInputOffsets.size() > 0){ + gpu_bit_packing_remainder(d_buffer, remainderInputOffsets, remainderBitOffsets, remainderSetSize, remainderOutputOffsets, d_output_int, num_bits); + } + + thrust::transform(thrust::device, d_output_int.begin(), d_output_int.end(), device_output, copy_functor()); + return batch_size; +} + + +#define CONCRETIZE_FUNCTION(T) \ +template int decode_using_gpu(const T *dictionary, int num_dictionary_values, T* d_output, const uint8_t *buffer, const int buffer_len, \ + const std::vector &rle_runs, \ + const std::vector &rle_values, \ + const std::vector &input_offset, \ + const std::vector &input_runlengths, \ + const std::vector &output_offset, \ + const std::vector &remainderInputOffsets, \ + const std::vector &remainderBitOffsets, \ + const std::vector &remainderSetSize, \ + const std::vector &remainderOutputOffsets, \ + int num_bits, \ + int batch_size \ + ) + +CONCRETIZE_FUNCTION(bool); +CONCRETIZE_FUNCTION(int32_t); +CONCRETIZE_FUNCTION(int64_t); +CONCRETIZE_FUNCTION(float); +CONCRETIZE_FUNCTION(double); + +#undef CONCRETIZE_FUNCTION + +template int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + bool* device_output, int batch_size + ); + + +template int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + int16_t* output, int batch_size + ); + +template int decode_def_levels(const uint8_t* buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + int16_t* output, int batch_size); + + + +} // namespace internal +} // namespace arrow +} // namespace gdf diff --git a/src/arrow/cu_decoder.cuh b/src/arrow/cu_decoder.cuh new file mode 100644 index 00000000..1fb0b5e8 --- /dev/null +++ b/src/arrow/cu_decoder.cuh @@ -0,0 +1,124 @@ +#ifndef _CU_DECODER_H_ +#define _CU_DECODER_H_ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Alexander Ocsa + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace gdf { +namespace arrow { +namespace internal { + + template + int decode_using_gpu(const T *dictionary, int num_dictionary_values, T* d_output, const uint8_t *buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector &input_offset, + const std::vector &intput_runlengths, + const std::vector &output_offset, + const std::vector &remainderInputOffsets, + const std::vector &remainderBitOffsets, + const std::vector &remainderSetSize, + const std::vector &remainderOutputOffsets, + int num_bits, int batch_size); + + template + int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* output, int batch_size); + template + int decode_def_levels(const uint8_t* buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* output, int batch_size); + + + // expands data vector that does not contain nulls into a representation that has indeterminate values where there should be nulls + // A vector of int work_space needs to be allocated to hold the map for the scatter operation. The workspace should be of size batch_size + template + void compact_to_sparse_for_nulls(T* data_in, T* data_out, const uint8_t* definition_levels, uint8_t max_definition_level, + int batch_size, int * work_space){ + + struct is_equal + { + uint8_t _val; + + __host__ __device__ is_equal(uint8_t val){ + _val = val; + } + __host__ __device__ + bool operator()(const uint8_t &x) + { + return x == _val; + } + }; + + is_equal op(max_definition_level); + thrust::counting_iterator iter(0); + auto out_iter = thrust::copy_if(iter, iter + batch_size, definition_levels, work_space, op); + int num_not_null = out_iter - work_space; + + thrust::scatter(data_in, data_in + num_not_null, work_space, data_out); + } + +} +} // namespace arrow +} // namespace gdf + + + + + + + + + + + + + + + + + + + + + + + +#endif // _CU_DECODER_H_ diff --git a/src/arrow/rle_decoder.h b/src/arrow/rle_decoder.h new file mode 100644 index 00000000..5086d7b3 --- /dev/null +++ b/src/arrow/rle_decoder.h @@ -0,0 +1,258 @@ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Alexander Ocsa + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GDF_ARROW_UTIL_RLE_DECODER_H +#define GDF_ARROW_UTIL_RLE_DECODER_H + +#include "bit-stream.h" +#include "cu_decoder.cuh" +#include +#include +#include +#include +#include + +namespace parquet { +class ColumnDescriptor; +} + +namespace gdf { +namespace arrow { + namespace internal { + + /// Decoder class for RLE encoded data. + class RleDecoder { + public: + /// Create a decoder object. buffer/buffer_len is the decoded data. + /// bit_width is the width of each value (before encoding). + RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + : bit_reader_(buffer, buffer_len) + , bit_width_(bit_width) + , current_value_(0) + , repeat_count_(0) + , literal_count_(0) + { + DCHECK_GE(bit_width_, 0); + DCHECK_LE(bit_width_, 64); + } + + RleDecoder() + : bit_width_(-1) + { + } + + void Reset(const uint8_t* buffer, int buffer_len, int bit_width) + { + DCHECK_GE(bit_width, 0); + DCHECK_LE(bit_width, 64); + bit_reader_.Reset(buffer, buffer_len); + bit_width_ = bit_width; + current_value_ = 0; + repeat_count_ = 0; + literal_count_ = 0; + } + + /// Gets the next value. Returns false if there are no more. + template + bool Get(T* val); + + /// Gets a batch of values. Returns the number of decoded elements. + template + int GetBatch(T* values, int batch_size); + + /// Like GetBatch but the values are then decoded using the provided + /// dictionary + template + int GetBatchWithDict(const T* dictionary, int num_dictionary_values, T* values, int batch_size); + + /// Like GetBatchWithDict but add spacing for null entries + template + int GetBatchWithDictSpaced(const T* dictionary, int num_dictionary_values, T* values, int batch_size, + int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset); + + protected: + BitReader bit_reader_; + /// Number of bits needed to encode the value. Must be between 0 and 64. + int bit_width_; + uint64_t current_value_; + uint32_t repeat_count_; + uint32_t literal_count_; + + private: + /// Fills literal_count_ and repeat_count_ with next values. Returns false if + /// there + /// are no more. 
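`RleDecoder` above consumes Parquet's RLE/bit-packing hybrid encoding, whose run header is a varint indicator with the run kind in the low bit, which is the same convention `NextCounts()` implements further down. A tiny standalone illustration of that header arithmetic:

```cpp
// Standalone illustration of the RLE/bit-packing hybrid run header that
// NextCounts() parses: a vlq indicator whose low bit selects the run kind.
#include <cstdint>
#include <cstdio>

int main() {
    // Indicator 0x0B = 0b1011 -> low bit 1: bit-packed (literal) run,
    // number of values = (0x0B >> 1) * 8 = 5 * 8 = 40.
    std::uint32_t literal_indicator = 0x0B;
    std::uint32_t literal_count = (literal_indicator >> 1) * 8;

    // Indicator 0x0A = 0b1010 -> low bit 0: repeated (RLE) run,
    // number of values = 0x0A >> 1 = 5; the repeated value follows in the
    // stream, stored in ceil(bit_width / 8) bytes.
    std::uint32_t repeat_indicator = 0x0A;
    std::uint32_t repeat_count = repeat_indicator >> 1;

    std::printf("literal run of %u values, repeated run of %u values\n",
                literal_count, repeat_count);
    return 0;
}
```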
+ template + bool NextCounts(); + }; + + template + inline bool RleDecoder::Get(T* val) + { + return GetBatch(val, 1) == 1; + } + + template + inline int RleDecoder::GetBatch(T* values, int batch_size) + { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + + std::vector rleRuns; + std::vector rleValues; + int numRle; + int numBitpacked; + std::vector< std::pair > bitpackset; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + while (values_read < batch_size) { + if (repeat_count_ > 0) { + int repeat_batch = std::min(batch_size - values_read, static_cast(repeat_count_)); + rleRuns.push_back(repeat_batch); + rleValues.push_back(current_value_); + + repeat_count_ -= repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = std::min(batch_size - values_read, static_cast(literal_count_)); + rleRuns.push_back(literal_batch); + rleValues.push_back(0); + + bit_reader_.SetGpuBatchMetadata( + bit_width_, values + values_read, literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + + literal_count_ -= literal_batch; + values_read += literal_batch; + } else { + if (!NextCounts()) + return values_read; + } + } + gdf::arrow::internal::decode_def_levels( + this->bit_reader_.get_buffer(), this->bit_reader_.get_buffer_len(), + rleRuns, rleValues, + unpack32InputOffsets, + unpack32InputRunLengths, + unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, bit_width_, values, batch_size); + + return values_read; + } + + template + inline int RleDecoder::GetBatchWithDict(const T* dictionary, int num_dictionary_values, T* values, + int batch_size) + { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + + std::vector rleRuns; + std::vector rleValues; + int numRle; + int numBitpacked; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + while (values_read < batch_size) { + if (repeat_count_ > 0) { + int repeat_batch = std::min(batch_size - values_read, static_cast(repeat_count_)); + rleRuns.push_back(repeat_batch); + rleValues.push_back(current_value_); + numRle++; + + repeat_count_ -= repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = std::min(batch_size - values_read, static_cast(literal_count_)); + + const int buffer_size = 1024; //@todo, check this buffer size for optimization + int indices[buffer_size]; + literal_batch = std::min(literal_batch, buffer_size); + rleRuns.push_back(literal_batch); + rleValues.push_back(0); + numBitpacked++; + bit_reader_.SetGpuBatchMetadata( + bit_width_, &indices[0], literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + literal_count_ -= literal_batch; + values_read += literal_batch; + } else { + if (!NextCounts()) + return values_read; + } + } + int actual_read = gdf::arrow::internal::decode_using_gpu(dictionary, num_dictionary_values, values, + this->bit_reader_.get_buffer(), this->bit_reader_.get_buffer_len(), + rleRuns, rleValues, + unpack32InputOffsets, + unpack32InputRunLengths, + 
unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, bit_width_, batch_size); + + return values_read; + } + + template + inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, int num_dictionary_values, T* values, + int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) + { + DCHECK_GE(bit_width_, 0); + + int values_read = GetBatchWithDict(dictionary, num_dictionary_values, values, batch_size); + + return values_read; + } + + template + inline bool RleDecoder::NextCounts() + { + // Read the next run's indicator int, it could be a literal or repeated run. + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (!result) + return false; + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + } else { + repeat_count_ = indicator_value >> 1; + bool result = bit_reader_.GetAligned( + static_cast(::arrow::BitUtil::Ceil(bit_width_, 8)), + reinterpret_cast(¤t_value_)); + DCHECK(result); + } + return true; + } + + } // namespace internal +} // namespace parquet +} // namespace gdf +#endif diff --git a/src/arrow/util/pinned_allocator.cu b/src/arrow/util/pinned_allocator.cu new file mode 100644 index 00000000..e5528ab8 --- /dev/null +++ b/src/arrow/util/pinned_allocator.cu @@ -0,0 +1,217 @@ +#include "pinned_allocator.cuh" + + cudaError_t CachingPinnedAllocator::pinnedAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes ) + { + *d_ptr = NULL; + + cudaError_t error = cudaSuccess; + + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key; + + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + cached_blocks.erase(block_itr); + + + block_itr++; + } + + // Done searching: unlock + mutex.unlock(); + } + + // Allocate the block if necessary + if (!found) + { + + // Attempt to allocate + if (cudaMallocHost((void **)&search_key.d_ptr, search_key.bytes) != cudaSuccess) + { + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key; + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end())) + { + // No need to worry about synchronization with the device: cudaFreeHost is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. + error = cudaFreeHost(block_itr->d_ptr); + if(error != cudaSuccess){ + // std::cout<<"could not free from host"; + break; + } + + // Reduce balance and erase entry + cached_bytes.free -= block_itr->bytes; + + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + error = cudaMallocHost((void **)&search_key.d_ptr, search_key.bytes); + if(error != cudaSuccess){ + return error; + } + + } + + // Insert into live blocks + mutex.lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex.unlock(); + + + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + return error; + } + + + cudaError_t CachingPinnedAllocator::pinnedFree( + void* d_ptr) + { + cudaError_t error = cudaSuccess; + + + + // Lock + mutex.lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + } + } + + // Unlock + mutex.unlock(); + + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + //TODO: see if we have to do anything here to handle concurrency + } + else + { + // Free the allocation from the runtime and cleanup the event. 
+ error = cudaFreeHost(d_ptr); + if (error != cudaSuccess) return error; + + } + + return error; + } + + + cudaError_t CachingPinnedAllocator::FreeAllCached() + { + cudaError_t error = cudaSuccess; + + mutex.lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + + + // Free device memory + error = cudaFreeHost(begin->d_ptr); + if (error != cudaSuccess) break; + + // Reduce balance and erase entry + cached_bytes.free -= begin->bytes; + + cached_blocks.erase(begin); + } + + mutex.unlock(); + + + return error; + } diff --git a/src/arrow/util/pinned_allocator.cuh b/src/arrow/util/pinned_allocator.cuh new file mode 100644 index 00000000..a99544d8 --- /dev/null +++ b/src/arrow/util/pinned_allocator.cuh @@ -0,0 +1,385 @@ +/* + * CachedPinnedAllocator.h + * + * Created on: Mar 15, 2018 + * Author: felipe + */ + +#ifndef SRC_GPUABSTRACTIONS_CACHEDPINNEDALLOCATOR_H_ +#define SRC_GPUABSTRACTIONS_CACHEDPINNEDALLOCATOR_H_ + + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + + + +#include +#include + + +#include +#include + + +#include "driver_types.h" + + +/****************************************************************************** + * CachingPinnedAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. 
+ * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingPinnedAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingPinnedAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + // int device; // device ordinal + // cudaStream_t associated_stream; // Associated associated_stream + // cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN) + + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor() : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN) + {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.d_ptr < b.d_ptr); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.bytes < b.bytes); + } + }; + + /// 
BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + + + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + CachingPinnedAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingPinnedAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(2), + min_bin(20), + max_bin(25), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.lock(); + + // if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + + cudaError_t pinnedAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes ); + + + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
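The constructors above fix the bin geometry; with the default configuration in this header (`bin_growth = 2`, `min_bin = 20`, `max_bin = 25`) a request is rounded up to the next power of the growth factor, exactly as `NearestPowerOf()` does. A quick worked example:

```cpp
// Worked example of the bin rounding done by NearestPowerOf() under the
// default configuration in this header: bin_growth = 2, min_bin = 20 (1 MiB),
// max_bin = 25 (32 MiB).
#include <cstddef>
#include <cstdio>

int main() {
    const unsigned int bin_growth = 2;
    const std::size_t request = 3u << 20;          // a 3 MiB allocation request

    unsigned int power = 0;
    std::size_t rounded = 1;
    while (rounded < request) { rounded *= bin_growth; ++power; }

    // 3 MiB is not a power of two, so it rounds up to bin 22 (4 MiB).
    std::printf("request %zu -> bin %u (%zu bytes)\n", request, power, rounded);

    // Requests under 1 MiB are bumped up to min_bin; requests over 32 MiB get
    // INVALID_BIN and bypass the cache entirely (see pinnedAllocate above).
    return 0;
}
```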
+ */ + cudaError_t pinnedFree( + void* d_ptr); + + + + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached(); + + + /** + * \brief Destructor + */ + virtual ~CachingPinnedAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + + +#endif /* SRC_GPUABSTRACTIONS_CACHEDPINNEDALLOCATOR_H_ */ diff --git a/src/bench/CMakeLists.txt b/src/bench/CMakeLists.txt new file mode 100644 index 00000000..17382abd --- /dev/null +++ b/src/bench/CMakeLists.txt @@ -0,0 +1,46 @@ + +if(GDF_BENCHMARK) + +include(ExternalProject) + +ExternalProject_Add(benchmark_ep + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=RELEASE + -DCMAKE_INSTALL_PREFIX=build + -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.4.1 + UPDATE_COMMAND "" +) +ExternalProject_Get_property(benchmark_ep BINARY_DIR) +set(BENCHMARK_ROOT ${BINARY_DIR}/build) + +file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/include) +file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/lib) + +add_library(Google::Benchmark INTERFACE IMPORTED) +add_dependencies(Google::Benchmark benchmark_ep) +set_target_properties(Google::Benchmark + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${BENCHMARK_ROOT}/include) +set_target_properties(Google::Benchmark + PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark.a) + +add_library(Google::Benchmark::Main INTERFACE IMPORTED) +set_target_properties(Google::Benchmark::Main + PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark_main.a) +endif() + + +function(GDF_ADD_BENCHMARK TARGET) + if(GDF_BENCHMARK) + list(REMOVE_AT ARGV 0) + cuda_add_executable(${TARGET} ${ARGV}) + target_include_directories(${TARGET} + PUBLIC ${CMAKE_SOURCE_DIR}/src/parquet) + target_link_libraries(${TARGET} + Google::Benchmark Google::Benchmark::Main + Threads::Threads gdf-parquet) + endif() +endfunction() + +add_subdirectory(parquet) \ No newline at end of file diff --git a/src/bench/parquet/CMakeLists.txt b/src/bench/parquet/CMakeLists.txt new file mode 100644 index 00000000..428bd241 --- /dev/null +++ b/src/bench/parquet/CMakeLists.txt @@ -0,0 +1,34 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Alexander Ocsa +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
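A hedged usage sketch for the allocator declared above: stage a host buffer through `pinnedAllocate`, use it for an asynchronous copy, and hand it back with `pinnedFree` so the block can be cached for reuse. The include path is an assumption and error handling is trimmed.

```cpp
// Usage sketch: staging a host buffer through the caching pinned allocator
// before an async copy. Error handling trimmed for brevity.
#include <cuda_runtime.h>
#include <cstring>
#include "pinned_allocator.cuh"   // assumed relative include path

int main() {
    CachingPinnedAllocator allocator;     // default bin configuration

    void *pinned = nullptr;
    size_t bytes = 1 << 20;
    if (allocator.pinnedAllocate(&pinned, bytes) != cudaSuccess) return 1;

    std::memset(pinned, 0, bytes);        // fill the staging buffer

    void *device = nullptr;
    cudaMalloc(&device, bytes);
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Pinned host memory is what lets cudaMemcpyAsync overlap with compute.
    cudaMemcpyAsync(device, pinned, bytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    allocator.pinnedFree(pinned);         // returned to the bin cache if it fits
    cudaStreamDestroy(stream);
    cudaFree(device);
    return 0;
}
```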
+#============================================================================= + +find_package(Boost REQUIRED COMPONENTS filesystem) + +set(PARQUET_FILE_PATH + ${CMAKE_SOURCE_DIR}/src/bench/parquet/huge_dataset.parquet) + +GDF_ADD_BENCHMARK(parquet-benchmark parquet-benchmark.cu + parquet-multithread-benchmark parquet-multithread-benchmark.cu + ../../tests/helper/utils.cuh + ../../tests/helper/utils.cu +) + +if (GDF_BENCHMARK) +target_compile_definitions(parquet-benchmark + PUBLIC -DPARQUET_FILE_PATH="${PARQUET_FILE_PATH}") +endif() + + diff --git a/src/bench/parquet/huge_dataset.parquet b/src/bench/parquet/huge_dataset.parquet new file mode 100644 index 00000000..6cc822be Binary files /dev/null and b/src/bench/parquet/huge_dataset.parquet differ diff --git a/src/bench/parquet/parquet-benchmark.cu b/src/bench/parquet/parquet-benchmark.cu new file mode 100644 index 00000000..2007f855 --- /dev/null +++ b/src/bench/parquet/parquet-benchmark.cu @@ -0,0 +1,234 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include "../../tests/helper/utils.cuh" + +#ifndef PARQUET_FILE_PATH +#error PARQUET_FILE_PATH must be defined for precompiling +#define PARQUET_FILE_PATH "/" +#endif + + +enum ReaderType : std::uint8_t { kGdf, kParquet }; + +template +struct Readers {}; + +template <> +struct Readers { + typedef typename gdf::parquet::BoolReader BoolReader; + typedef typename gdf::parquet::Int32Reader Int32Reader; + typedef typename gdf::parquet::Int64Reader Int64Reader; + typedef typename gdf::parquet::FloatReader FloatReader; + typedef typename gdf::parquet::DoubleReader DoubleReader; + typedef typename gdf::parquet::FileReader FileReader; + + + static inline gdf_error init_gdf_buffers(void **device_values, gdf_valid_type** device_valid, int16_t** def_levels, uint32_t values_malloc_size, gdf_size_type column_size){ + cudaError_t cuda_error = cudaMalloc(device_values, values_malloc_size); + auto n_bytes = get_number_of_bytes_for_valid(column_size); + cudaMalloc(device_valid, n_bytes); + cudaMalloc(def_levels, column_size * sizeof(int16_t)); + return GDF_SUCCESS; + } + + + static inline gdf_error buffer_to_gdf_column(gdf_column *output, void *device_values, gdf_valid_type* device_valid, uint32_t values_malloc_size, gdf_size_type column_size, gdf_dtype dtype) { + return gdf_column_view_init(output, device_values, device_valid, column_size, dtype, 0); + } + + static inline void freeDefLevels(int16_t* def_levels){ + cudaFree(def_levels); + } +}; + + +template <> +struct Readers { + typedef typename ::parquet::BoolReader BoolReader; + typedef typename ::parquet::Int32Reader Int32Reader; + typedef typename ::parquet::Int64Reader Int64Reader; + typedef typename ::parquet::FloatReader FloatReader; + typedef typename ::parquet::DoubleReader DoubleReader; + typedef typename ::parquet::ParquetFileReader FileReader; + + static 
inline gdf_error init_gdf_buffers(void **host_values, gdf_valid_type** host_valid, int16_t** def_levels, uint32_t values_malloc_size, gdf_size_type column_size){ + *host_values = malloc(values_malloc_size); + auto n_bytes = get_number_of_bytes_for_valid(column_size); + *host_valid = (gdf_valid_type*)malloc(n_bytes); + *def_levels = (int16_t*)malloc(column_size * sizeof(int16_t)); + return GDF_SUCCESS; + } + + static inline gdf_error buffer_to_gdf_column(gdf_column *output, void *host_values, gdf_valid_type* host_valid, uint32_t values_malloc_size, gdf_size_type column_size, gdf_dtype dtype) { + void *device_values; + cudaError_t cuda_error = cudaMalloc((void **)&device_values, values_malloc_size); + cudaMemcpy(device_values, host_values, values_malloc_size, cudaMemcpyHostToDevice); + + gdf_valid_type *device_valid; + auto n_bytes = get_number_of_bytes_for_valid(column_size); + cudaMalloc((void **)&device_valid, n_bytes); + cudaMemcpy(device_valid, host_valid, n_bytes, cudaMemcpyHostToDevice); + + auto zero_bits = count_zero_bits(host_valid, column_size); + + free(host_values); + free(host_valid); + return gdf_column_view_init(output, device_values, device_valid, column_size, dtype, zero_bits); + } + + static inline void freeDefLevels(int16_t* def_levels){ + free(def_levels); + } +}; + + +template +struct parquet_traits { + +}; + +#define PARQUET_TRAITS_FACTORY(ParquetTypeEnum, ParquetType, GdfTypeValue) \ +template <> struct parquet_traits { \ + typedef ParquetType parquet_type; \ + static inline gdf_dtype gdf_type() { return GdfTypeValue; } \ +} + +PARQUET_TRAITS_FACTORY(parquet::Type::BOOLEAN, bool, GDF_INT8); +PARQUET_TRAITS_FACTORY(parquet::Type::INT32, int32_t, GDF_INT32); +PARQUET_TRAITS_FACTORY(parquet::Type::INT64, int64_t, GDF_INT64); +PARQUET_TRAITS_FACTORY(parquet::Type::FLOAT, float, GDF_FLOAT32); +PARQUET_TRAITS_FACTORY(parquet::Type::DOUBLE, double, GDF_FLOAT64); + +#undef PARQUET_TRAITS_FACTORY + +template +static inline gdf_error +convert(gdf_column *column, ColumnReaderType *column_reader, int64_t amount_to_read, uint32_t batch_size) { + typedef typename parquet_traits::parquet_type parquet_type; + parquet_type* values_buffer; + gdf_valid_type* valid_bits; + int16_t * definition_level; + + auto values_malloc_size = amount_to_read * sizeof(parquet_type); + gdf_error status = Readers::init_gdf_buffers((void **)&(values_buffer), &valid_bits, &definition_level, values_malloc_size, amount_to_read); + + std::int64_t levels_read; + std::int64_t values_read = 0; + std::int64_t nulls_count; + + int64_t rows_read_total = 0; + while (column_reader->HasNext() && rows_read_total < amount_to_read) { + int64_t rows_read = column_reader->ReadBatchSpaced(batch_size, + &definition_level[rows_read_total], + nullptr, + &values_buffer[rows_read_total], + valid_bits, + 0, + &levels_read, + &values_read, + &nulls_count); + rows_read_total += rows_read; + } + + Readers::buffer_to_gdf_column(column, (void *)values_buffer, valid_bits, values_malloc_size, amount_to_read, parquet_traits::gdf_type()); + + Readers::freeDefLevels(definition_level); + + return GDF_SUCCESS; +} + + +template +static inline gdf_error containerFrom(gdf_column *column, std::shared_ptr column_reader, int64_t numRecords, uint32_t batch_size) { + + parquet::Type::type parquetDataType = column_reader->type(); + + #define WHEN(dataType, Prefix) \ + if ((dataType) == parquetDataType) \ + return convert::Prefix##Reader, dataType> \ + (column, static_cast::Prefix##Reader*>(column_reader.get()), numRecords, batch_size) + + 
WHEN(parquet::Type::BOOLEAN, Bool); + WHEN(parquet::Type::INT32, Int32); + WHEN(parquet::Type::INT64, Int64); + WHEN(parquet::Type::FLOAT, Float); + WHEN(parquet::Type::DOUBLE, Double); + + #undef WHEN + + throw std::invalid_argument("ERROR: Bad parquet column type"); +} +template +inline static void +readRowGroup(const std::unique_ptr::FileReader> &parquet_reader, uint32_t batch_size) { + + std::shared_ptr file_metadata = parquet_reader->metadata(); + const parquet::SchemaDescriptor *schema = file_metadata->schema(); + int numRowGroups = file_metadata->num_row_groups(); + + std::vector columns; + + for (int rowGroupIndex = 0; rowGroupIndex < numRowGroups; rowGroupIndex++) { + auto groupReader = parquet_reader->RowGroup(rowGroupIndex); + const parquet::RowGroupMetaData *rowGroupMetadata = groupReader->metadata(); + for (int columnIndex = 0; columnIndex < file_metadata->num_columns(); columnIndex++) { + const parquet::ColumnDescriptor *column = schema->Column(columnIndex); + std::unique_ptr columnMetaData = rowGroupMetadata->ColumnChunk( + columnIndex); + parquet::Type::type type = column->physical_type(); + + if (type != parquet::Type::BYTE_ARRAY){ + const std::shared_ptr columnReader = groupReader->Column(columnIndex); + int64_t numRecords = rowGroupMetadata->num_rows(); + + gdf_column output; + containerFrom(&output, columnReader, numRecords, batch_size); + columns.push_back(output); + } + } + } + + for(size_t i = 0; i < columns.size(); i++) + { + delete_gdf_column(&(columns[i])); + } +} + +template +static void +BM_FileRead(benchmark::State &state) { + for (auto _ : state) { + std::unique_ptr::FileReader> reader = + Readers::FileReader::OpenFile(PARQUET_FILE_PATH); + + readRowGroup(reader, state.range(0)); + } +} + +BENCHMARK_TEMPLATE(BM_FileRead, kParquet)->Arg(50000)->Arg(100000)->Arg(500000)->Arg(1000000); +BENCHMARK_TEMPLATE(BM_FileRead, kGdf)->Arg(50000)->Arg(100000)->Arg(500000)->Arg(1000000); + + diff --git a/src/bench/parquet/parquet-multithread-benchmark.cu b/src/bench/parquet/parquet-multithread-benchmark.cu new file mode 100644 index 00000000..2a831e6d --- /dev/null +++ b/src/bench/parquet/parquet-multithread-benchmark.cu @@ -0,0 +1,81 @@ +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include "../../tests/helper/utils.cuh" + +#ifndef PARQUET_FILE_PATH +#error PARQUET_FILE_PATH must be defined for precompiling +#define PARQUET_FILE_PATH "/" +#endif + +static void +BM_FileRead_mt(benchmark::State &state) { + + for (auto _ : state) { + + gdf_column *columns = nullptr; + std::size_t columns_length = 0; + gdf_error error_code = gdf::parquet::read_parquet( + PARQUET_FILE_PATH, nullptr, &columns, &columns_length); + + + for (std::size_t i = 0; i < columns_length; i++){ + cudaFree(columns[i].data); + cudaFree(columns[i].valid); + } + + } +} + + +// NOTE: this way of doing the reading singlethreaded adds some overhead. 
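The `BENCHMARK_TEMPLATE(...)->Arg(...)` registrations above follow google/benchmark's usual pattern: a templated benchmark body instantiated per reader kind and swept over batch sizes. A standalone toy version of the same registration pattern, built against `-lbenchmark`; everything in it is illustrative:

```cpp
// Toy version of the templated benchmark registration pattern used above.
#include <benchmark/benchmark.h>
#include <vector>
#include <numeric>

enum Kind { kA, kB };

template <Kind kind>
static void BM_SumVector(benchmark::State &state) {
    std::vector<int> v(static_cast<size_t>(state.range(0)), kind == kA ? 1 : 2);
    for (auto _ : state) {
        long long sum = std::accumulate(v.begin(), v.end(), 0LL);
        benchmark::DoNotOptimize(sum);   // keep the work from being optimized away
    }
}

BENCHMARK_TEMPLATE(BM_SumVector, kA)->Arg(50000)->Arg(100000);
BENCHMARK_TEMPLATE(BM_SumVector, kB)->Arg(50000)->Arg(100000);

BENCHMARK_MAIN();   // the real benchmark sources instead link libbenchmark_main.a
```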
+static void +BM_FileRead_st(benchmark::State &state) { + + for (auto _ : state) { + + const std::unique_ptr file_reader = gdf::parquet::FileReader::OpenFile(PARQUET_FILE_PATH); + + std::shared_ptr file_metadata = file_reader->metadata(); + + int numRowGroups = file_metadata->num_row_groups(); + int num_columns = file_metadata->num_columns(); + + auto schema = file_reader->RowGroup(0)->metadata()->schema(); + + std::vector row_group_indices(1); + std::vector column_indices(1); + + for (int rg = 0; rg < numRowGroups; rg++){ + for (int col = 0; col < num_columns; col++){ + + if (schema->Column(col)->physical_type() != ::parquet::Type::BYTE_ARRAY && + schema->Column(col)->physical_type() != ::parquet::Type::FIXED_LEN_BYTE_ARRAY){ + + row_group_indices[0] = rg; + column_indices[0] = col; + + std::vector out_gdf_columns; + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + PARQUET_FILE_PATH, row_group_indices, column_indices, out_gdf_columns); + + + for (std::size_t i = 0; i < out_gdf_columns.size(); i++){ + cudaFree(out_gdf_columns[i]->data); + cudaFree(out_gdf_columns[i]->valid); + } + + } + } + } + + + } +} + +BENCHMARK(BM_FileRead_mt); +BENCHMARK(BM_FileRead_st); diff --git a/src/parquet/api.cpp b/src/parquet/api.cpp new file mode 100644 index 00000000..75a3878c --- /dev/null +++ b/src/parquet/api.cpp @@ -0,0 +1,683 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
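The helpers used in these benchmarks (`get_number_of_bytes_for_valid`, `count_zero_bits`) revolve around the gdf validity bitmap: one bit per row, presumably least-significant bit first. A small illustration of that arithmetic, with `bytes_for_valid` as a stand-in for the real helper:

```cpp
// Illustration of the validity-bitmap arithmetic: a column of n rows needs
// ceil(n / 8) bitmask bytes, and the null count is the number of zero bits
// among the first n bits (LSB-first assumed here).
#include <cstdint>
#include <cstdio>

static std::size_t bytes_for_valid(std::size_t rows) { return (rows + 7) / 8; }

int main() {
    const std::size_t rows = 10;              // 10 rows -> 2 bitmask bytes
    std::uint8_t valid[2] = {0xFF, 0x01};     // rows 0..8 valid, row 9 null

    std::size_t nulls = 0;
    for (std::size_t i = 0; i < rows; ++i)
        if (!((valid[i / 8] >> (i % 8)) & 1)) ++nulls;

    std::printf("%zu bytes, %zu null(s)\n", bytes_for_valid(rows), nulls); // 2, 1
    return 0;
}
```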
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include + +BEGIN_NAMESPACE_GDF_PARQUET + +namespace { + +struct ParquetTypeHash { + template + std::size_t + operator()(T t) const { + return static_cast(t); + } +}; + +const std::unordered_map<::parquet::Type::type, gdf_dtype, ParquetTypeHash> + dtype_from_physical_type_map{ + {::parquet::Type::BOOLEAN, GDF_INT8}, + {::parquet::Type::INT32, GDF_INT32}, + {::parquet::Type::INT64, GDF_INT64}, + {::parquet::Type::FLOAT, GDF_FLOAT32}, + {::parquet::Type::DOUBLE, GDF_FLOAT64}, + }; + +const std:: + unordered_map<::parquet::LogicalType::type, gdf_dtype, ParquetTypeHash> + dtype_from_logical_type_map{ + {::parquet::LogicalType::NONE, GDF_invalid}, + {::parquet::LogicalType::UTF8, GDF_invalid}, + {::parquet::LogicalType::MAP, GDF_invalid}, + {::parquet::LogicalType::MAP_KEY_VALUE, GDF_invalid}, + {::parquet::LogicalType::LIST, GDF_invalid}, + {::parquet::LogicalType::ENUM, GDF_invalid}, + {::parquet::LogicalType::DECIMAL, GDF_invalid}, + {::parquet::LogicalType::DATE, GDF_DATE32}, + {::parquet::LogicalType::TIME_MILLIS, GDF_invalid}, + {::parquet::LogicalType::TIME_MICROS, GDF_invalid}, + {::parquet::LogicalType::TIMESTAMP_MILLIS, GDF_TIMESTAMP}, + {::parquet::LogicalType::TIMESTAMP_MICROS, GDF_invalid}, + {::parquet::LogicalType::UINT_8, GDF_invalid}, + {::parquet::LogicalType::UINT_16, GDF_invalid}, + {::parquet::LogicalType::UINT_32, GDF_invalid}, + {::parquet::LogicalType::UINT_64, GDF_invalid}, + {::parquet::LogicalType::INT_8, GDF_INT8}, + {::parquet::LogicalType::INT_16, GDF_INT16}, + {::parquet::LogicalType::INT_32, GDF_INT32}, + {::parquet::LogicalType::INT_64, GDF_INT64}, + {::parquet::LogicalType::JSON, GDF_invalid}, + {::parquet::LogicalType::BSON, GDF_invalid}, + {::parquet::LogicalType::INTERVAL, GDF_invalid}, + {::parquet::LogicalType::NA, GDF_invalid}, + }; + +static inline gdf_dtype +_DTypeFrom(const ::parquet::ColumnDescriptor *const column_descriptor) { + const ::parquet::LogicalType::type logical_type = + column_descriptor->logical_type(); + + if (logical_type != ::parquet::LogicalType::NONE) { + return dtype_from_logical_type_map.at(logical_type); + } + + const ::parquet::Type::type physical_type = + column_descriptor->physical_type(); + + return dtype_from_physical_type_map.at(physical_type); +} + +static inline gdf_error +_ReadColumn(const std::shared_ptr &row_group_reader, + const std::vector & column_indices, + std::size_t offsets[], + gdf_column *const gdf_columns) { + for (std::size_t column_reader_index = 0; + column_reader_index < column_indices.size(); + column_reader_index++) { + const gdf_column &_gdf_column = gdf_columns[column_reader_index]; + const std::shared_ptr<::parquet::ColumnReader> column_reader = + row_group_reader->Column( + static_cast(column_indices[column_reader_index])); + + switch (column_reader->type()) { +#define WHEN(TYPE) \ + case ::parquet::Type::TYPE: { \ + std::shared_ptr>> \ + reader = std::static_pointer_cast>>(column_reader); \ + if (reader->HasNext()) { \ + offsets[column_reader_index] += \ + reader->ToGdfColumn(_gdf_column, offsets[column_reader_index]); \ + } \ + } break + WHEN(BOOLEAN); + WHEN(INT32); + WHEN(INT64); + WHEN(FLOAT); + WHEN(DOUBLE); + default: +#ifdef GDF_DEBUG + std::cerr << "Column type error from file" << std::endl; +#endif + return GDF_IO_ERROR; //TODO: improve using exception handling +#undef WHEN + } + } + return GDF_SUCCESS; +} + +static inline gdf_error 
+_ReadFile(const std::unique_ptr &file_reader, + const std::vector & indices, + gdf_column *const gdf_columns) { + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_rows = + static_cast(metadata->num_rows()); + const std::size_t num_row_groups = + static_cast(metadata->num_row_groups()); + + std::size_t offsets[indices.size()]; + for (std::size_t i = 0; i < indices.size(); i++) { offsets[i] = 0; } + + for (std::size_t row_group_index = 0; row_group_index < num_row_groups; + row_group_index++) { + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + gdf_error status = + _ReadColumn(row_group_reader, indices, offsets, gdf_columns); + if (status != GDF_SUCCESS) { return status; } + } + + return GDF_SUCCESS; +} + +static inline gdf_error +_ReadFile(const std::unique_ptr &file_reader, + const std::vector & row_group_indices, + const std::vector & column_indices, + gdf_column *const gdf_columns) { + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_rows = + static_cast(metadata->num_rows()); + + std::size_t offsets[column_indices.size()]; + for (std::size_t i = 0; i < column_indices.size(); i++) { offsets[i] = 0; } + + for (const std::size_t row_group_index : row_group_indices) { + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + gdf_error status = + _ReadColumn(row_group_reader, column_indices, offsets, gdf_columns); + if (status != GDF_SUCCESS) { return status; } + } + + return GDF_SUCCESS; +} + + +struct ParquetReaderJob { + + std::size_t row_group_index; + std::size_t column_index; + std::size_t column_index_in_read_set; + +// std::shared_ptr row_group_reader; + std::shared_ptr<::parquet::ColumnReader> column_reader; + + const gdf_column & column; + std::size_t offset; + + gdf_valid_type first_valid_byte; + gdf_valid_type last_valid_byte; + + ParquetReaderJob(std::size_t _row_group_index, + std::size_t _column_index, + std::size_t _column_index_in_read_set, +// std::shared_ptr _row_group_reader, + std::shared_ptr<::parquet::ColumnReader> _column_reader, + const gdf_column & _column, + std::size_t _offset ) + : row_group_index(_row_group_index), + column_index(_column_index), + column_index_in_read_set(_column_index_in_read_set), +// row_group_reader(std::move(_row_group_reader)), + column_reader(std::move(_column_reader)), + column(std::move(_column)), + offset(_offset) + {} +}; + + + +void _ProcessParquetReaderJobsThread(std::vector & jobs, std::mutex & lock, + int & job_index, gdf_error & gdf_error_out){ + + lock.lock(); + int current_job = job_index; + job_index++; + lock.unlock(); + + gdf_error current_gdf_error = GDF_SUCCESS; + + while (current_job < jobs.size()){ + + switch (jobs[current_job].column_reader->type()) { + #define WHEN(TYPE) \ + case ::parquet::Type::TYPE: { \ + std::shared_ptr>> \ + reader = std::static_pointer_cast>>(jobs[current_job].column_reader); \ + if (reader->HasNext()) { \ + reader->ToGdfColumn(jobs[current_job].column, jobs[current_job].offset, jobs[current_job].first_valid_byte, jobs[current_job].last_valid_byte); \ + } \ + } break + WHEN(BOOLEAN); + WHEN(INT32); + WHEN(INT64); + WHEN(FLOAT); + WHEN(DOUBLE); + default: + #ifdef GDF_DEBUG + std::cerr << "Column type error from file" << std::endl; + #endif + current_gdf_error = GDF_IO_ERROR; //TODO: improve using exception handling + #undef WHEN + } + + + lock.lock(); + if (gdf_error_out != GDF_SUCCESS){ // if error we 
want to exit + current_job = jobs.size(); + } else if (current_gdf_error != GDF_SUCCESS) { // if error we want to exit + gdf_error_out = current_gdf_error; + current_job = jobs.size(); + } else { + current_job = job_index; + } + job_index++; + lock.unlock(); + } + +} + +gdf_error _ProcessParquetReaderJobs(std::vector & jobs){ + + std::mutex lock; + int job_index = 0; + gdf_error gdf_error_out = GDF_SUCCESS; + + int num_threads = std::thread::hardware_concurrency(); + num_threads = jobs.size() < num_threads ? jobs.size() : num_threads; + + +// _ProcessParquetReaderJobsThread(jobs, lock, job_index, gdf_error_out); + + std::vector threads(num_threads); + + for (int i = 0; i < num_threads; i++){ + threads[i] = std::thread(_ProcessParquetReaderJobsThread, + std::ref(jobs), std::ref(lock), std::ref(job_index), std::ref(gdf_error_out)); + } + for (int i = 0; i < num_threads; i++){ + threads[i].join(); + } + + + + return gdf_error_out; +} + + +static inline gdf_error +_ReadFileMultiThread(const std::unique_ptr &file_reader, + const std::vector & row_group_indices, + const std::vector & column_indices, + gdf_column *const gdf_columns) { + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_rows = + static_cast(metadata->num_rows()); + + + std::vector jobs; + + std::vector offsets(row_group_indices.size(), 0); + + for (std::size_t row_group_index_in_set = 0; row_group_index_in_set < row_group_indices.size(); + row_group_index_in_set++) { + + std::size_t row_group_index = row_group_indices[row_group_index_in_set]; + + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + int64_t num_rows = row_group_reader->metadata()->num_rows(); + + + for (std::size_t column_reader_index = 0; + column_reader_index < column_indices.size(); + column_reader_index++) { + const gdf_column &_gdf_column = gdf_columns[column_reader_index]; + const std::shared_ptr<::parquet::ColumnReader> column_reader = + row_group_reader->Column( + static_cast(column_indices[column_reader_index])); + + jobs.emplace_back(row_group_index, column_indices[column_reader_index], + column_reader_index, column_reader, + _gdf_column, offsets[row_group_index_in_set]); + + } + + if (row_group_index_in_set < row_group_indices.size() - 1){ + offsets[row_group_index_in_set + 1] = offsets[row_group_index_in_set] + num_rows; + } + } + + gdf_error gdf_error_out = _ProcessParquetReaderJobs(jobs); + + // now lets fix all the valid bytes that were shared for a column accross rowgroups + if (row_group_indices.size() > 1){ + for (std::size_t column_reader_index = 0; column_reader_index < column_indices.size(); + column_reader_index++) { + + for (std::size_t row_group_index_in_set = 0; row_group_index_in_set < row_group_indices.size() - 1; + row_group_index_in_set++) { + + int job_index1 = (row_group_index_in_set * column_indices.size()) + column_reader_index; + int job_index2 = ((row_group_index_in_set + 1) * column_indices.size()) + column_reader_index; + + gdf_valid_type merged = jobs[job_index1].last_valid_byte | jobs[job_index2].first_valid_byte; + + // determine location of where the merged byte goes + // copy merged into valid + std::size_t merged_byte_offset = (offsets[row_group_index_in_set + 1]/8); + + cudaMemcpy(gdf_columns[column_reader_index].valid + merged_byte_offset, &merged, sizeof(gdf_valid_type), cudaMemcpyHostToDevice); + } + } + } + + + return gdf_error_out; +} + + + +template <::parquet::Type::type TYPE> +static inline gdf_error 
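`_ProcessParquetReaderJobs` above hands out job indices to worker threads from a shared counter guarded by a mutex, and lets any worker's error short-circuit the rest. A self-contained sketch of that dispatch pattern with a trivial stand-in for the per-job decoding work:

```cpp
// Sketch of the mutex-guarded job-index dispatch used by
// _ProcessParquetReaderJobs: each worker claims the next index under the lock
// and exits early if a failure has been flagged.
#include <mutex>
#include <thread>
#include <vector>
#include <cstdio>

static void worker(std::vector<int> &jobs, std::mutex &lock,
                   int &job_index, bool &failed) {
    lock.lock();
    int current = job_index++;
    lock.unlock();

    while (current < static_cast<int>(jobs.size())) {
        jobs[current] *= 2;                 // stand-in for decoding one column chunk

        lock.lock();
        current = failed ? static_cast<int>(jobs.size())  // bail out on error
                         : job_index;
        ++job_index;
        lock.unlock();
    }
}

int main() {
    std::vector<int> jobs(16, 1);
    std::mutex lock;
    int job_index = 0;
    bool failed = false;

    unsigned n = std::thread::hardware_concurrency();
    if (n == 0 || n > jobs.size()) n = static_cast<unsigned>(jobs.size());

    std::vector<std::thread> threads;
    for (unsigned i = 0; i < n; ++i)
        threads.emplace_back(worker, std::ref(jobs), std::ref(lock),
                             std::ref(job_index), std::ref(failed));
    for (auto &t : threads) t.join();

    std::printf("first job value: %d\n", jobs[0]);    // 2
    return 0;
}
```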
+_AllocateGdfColumn(const std::size_t num_rows, + const ::parquet::ColumnDescriptor *const column_descriptor, + gdf_column & _gdf_column) { + const std::size_t value_byte_size = + static_cast(::parquet::type_traits::value_byte_size); + + cudaError_t status = + cudaMalloc(&_gdf_column.data, num_rows * value_byte_size); + if (status != cudaSuccess) { +#ifdef GDF_DEBUG + std::cerr << "Allocation error for data\n" << e.what() << std::endl; +#endif + return GDF_IO_ERROR; + } + + status = cudaMalloc(reinterpret_cast(&_gdf_column.valid), + ::arrow::BitUtil::BytesForBits(num_rows)); + if (status != cudaSuccess) { +#ifdef GDF_DEBUG + std::cerr << "Allocation error for valid\n" << e.what() << std::endl; +#endif + return GDF_IO_ERROR; + } + + _gdf_column.size = num_rows; + _gdf_column.dtype = _DTypeFrom(column_descriptor); + + return GDF_SUCCESS; +} // namespace + +static inline std::vector +_ColumnDescriptorsFrom(const std::unique_ptr &file_reader, + const std::vector & indices) { + const auto &row_group_reader = file_reader->RowGroup(0); + + std::vector column_descriptors; + column_descriptors.reserve(indices.size()); + + for (const std::size_t i : indices) { + column_descriptors.emplace_back(row_group_reader->Column(i)->descr()); + } + + return column_descriptors; +} + +static inline gdf_error +_AllocateGdfColumns(const std::unique_ptr &file_reader, + const std::vector & row_group_indices, + const std::vector & column_indices, + gdf_column *const gdf_columns) { + const std::vector column_descriptors = + _ColumnDescriptorsFrom(file_reader, column_indices); + + int64_t num_rows = 0; + for (std::size_t row_group_index_in_set = 0; row_group_index_in_set < row_group_indices.size(); + row_group_index_in_set++) { + + std::size_t row_group_index = row_group_indices[row_group_index_in_set]; + + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + num_rows += row_group_reader->metadata()->num_rows(); + } + + + const std::size_t num_columns = column_indices.size(); + + +#define WHEN(TYPE) \ + case ::parquet::Type::TYPE: \ + _AllocateGdfColumn<::parquet::Type::TYPE>( \ + num_rows, column_descriptor, _gdf_column); \ + break + + for (std::size_t i = 0; i < num_columns; i++) { + gdf_column & _gdf_column = gdf_columns[i]; + const ::parquet::ColumnDescriptor *const column_descriptor = + column_descriptors[i]; + + switch (column_descriptor->physical_type()) { + WHEN(BOOLEAN); + WHEN(INT32); + WHEN(INT64); + WHEN(FLOAT); + WHEN(DOUBLE); + default: +#ifdef GDF_DEBUG + std::cerr << "Column type not supported" << std::endl; +#endif + return GDF_IO_ERROR; + } + } +#undef WHEN + return GDF_SUCCESS; +} + +static inline gdf_column * +_CreateGdfColumns(const std::size_t num_columns) try { + return new gdf_column[num_columns]; +} catch (const std::bad_alloc &e) { +#ifdef GDF_DEBUG + std::cerr << "Allocation error for gdf columns\n" << e.what() << std::endl; +#endif + return nullptr; +} + + +static inline std::vector +_GetColumnIndices(const std::unique_ptr &file_reader, + const char *const *const raw_names){ + + std::vector indices; + + const std::shared_ptr &metadata = + file_reader->metadata(); + + const std::size_t num_columns = + static_cast(metadata->num_columns()); + + auto schema = file_reader->RowGroup(0)->metadata()->schema(); + + std::vector> parquet_columns; + parquet_columns.reserve(num_columns); + + for (std::size_t i = 0; i < num_columns; i++) { + if (schema->Column(i)->physical_type() != ::parquet::Type::BYTE_ARRAY && + schema->Column(i)->physical_type() != 
::parquet::Type::FIXED_LEN_BYTE_ARRAY){ + + parquet_columns.push_back(std::make_pair(schema->Column(i)->name(), i)); + + } + } + + if (raw_names != nullptr) { + for (const char *const *name_ptr = raw_names; *name_ptr != nullptr; + name_ptr++) { + + std::string filter_name = *name_ptr; + for (std::size_t i = 0; i < parquet_columns.size(); i++) { + if (filter_name == parquet_columns[i].first){ + indices.push_back(parquet_columns[i].second); + break; + } + } + } + } else { + for (std::size_t i = 0; i < parquet_columns.size(); i++) { + indices.push_back(parquet_columns[i].second); + } + } + return indices; +} + + +static inline gdf_error +_CheckMinimalData(const std::unique_ptr &file_reader) { + const std::shared_ptr &metadata = + file_reader->metadata(); + + if (metadata->num_row_groups() == 0) { return GDF_IO_ERROR; } + + if (metadata->num_rows() == 0) { return GDF_IO_ERROR; } + + return GDF_SUCCESS; +} + +static inline std::unique_ptr +_OpenFile(const std::string &filename) try { + return FileReader::OpenFile(filename); +} catch (std::exception &e) { +#ifdef GDF_DEBUG + std::cerr << "Open file\n" << e.what() << std::endl; +#endif + return nullptr; +} + +static inline std::unique_ptr +_OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file) try { + return FileReader::OpenFile(file); +} catch (std::exception &e) { +#ifdef GDF_DEBUG + std::cerr << "Open file\n" << e.what() << std::endl; +#endif + return nullptr; +} + +} // namespace + + +static inline gdf_error +_read_parquet_by_ids(const std::unique_ptr & file_reader, + const std::vector &row_group_indices, + const std::vector &column_indices, + gdf_column *const gdf_columns) { + + if (gdf_columns == nullptr) { return GDF_IO_ERROR; } + + if (_AllocateGdfColumns(file_reader, row_group_indices, column_indices, gdf_columns) + != GDF_SUCCESS) { + return GDF_IO_ERROR; + } + + if (_ReadFileMultiThread(file_reader, row_group_indices, column_indices, gdf_columns) + != GDF_SUCCESS) { + return GDF_IO_ERROR; + } + + return GDF_SUCCESS; +} + + +gdf_error +read_parquet_by_ids(const std::string & filename, + const std::vector &row_group_indices, + const std::vector &column_indices, + std::vector & out_gdf_columns) { + + const std::unique_ptr file_reader = _OpenFile(filename); + + if (!file_reader) { return GDF_IO_ERROR; } + + if (_CheckMinimalData(file_reader) != GDF_SUCCESS) { return GDF_IO_ERROR; } + + gdf_column *const gdf_columns = _CreateGdfColumns(column_indices.size()); + + gdf_error status = _read_parquet_by_ids(std::move(file_reader), row_group_indices, column_indices, gdf_columns); + + for (std::size_t i = 0; i < column_indices.size(); i++) { + out_gdf_columns.push_back(&gdf_columns[i]); + } + + return status; +} + +gdf_error +read_parquet_by_ids(std::shared_ptr<::arrow::io::RandomAccessFile> file, + const std::vector &row_group_indices, + const std::vector &column_indices, + std::vector & out_gdf_columns) { + + const std::unique_ptr file_reader = _OpenFile(file); + + if (!file_reader) { return GDF_IO_ERROR; } + + if (_CheckMinimalData(file_reader) != GDF_SUCCESS) { return GDF_IO_ERROR; } + + gdf_column *const gdf_columns = _CreateGdfColumns(column_indices.size()); + + gdf_error status = _read_parquet_by_ids(std::move(file_reader), row_group_indices, column_indices, gdf_columns); + + for (std::size_t i = 0; i < column_indices.size(); i++) { + out_gdf_columns.push_back(&gdf_columns[i]); + } + + return status; +} + +extern "C" { + +gdf_error +read_parquet(const char *const filename, + const char *const *const columns, + gdf_column **const 
out_gdf_columns, + size_t *const out_gdf_columns_length) { + + const std::unique_ptr file_reader = _OpenFile(filename); + + if (!file_reader) { return GDF_IO_ERROR; } + + if (_CheckMinimalData(file_reader) != GDF_SUCCESS) { return GDF_IO_ERROR; } + + const std::vector column_indices = + _GetColumnIndices(file_reader, columns); + + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_row_groups = + static_cast(metadata->num_row_groups()); + + std::vector row_group_ind(num_row_groups); + std::iota( row_group_ind.begin(), row_group_ind.end(), 0); + + const std::vector row_group_indices(row_group_ind); + + gdf_column *const gdf_columns = _CreateGdfColumns(column_indices.size()); + + gdf_error status = _read_parquet_by_ids(std::move(file_reader), row_group_indices, column_indices, gdf_columns); + + *out_gdf_columns = gdf_columns; + *out_gdf_columns_length = column_indices.size(); + + return status; + +} +} + +END_NAMESPACE_GDF_PARQUET diff --git a/src/parquet/column_reader.cu b/src/parquet/column_reader.cu new file mode 100644 index 00000000..5084242d --- /dev/null +++ b/src/parquet/column_reader.cu @@ -0,0 +1,717 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "column_reader.h" +#include "dictionary_decoder.cuh" +#include "plain_decoder.cuh" + +#include +#include "../util/bit_util.cuh" + +namespace gdf +{ +namespace parquet +{ + +template +static inline void +_ConfigureDictionary( + const ::parquet::Page *page, + std::unordered_map> &decoders, + const ::parquet::ColumnDescriptor *const column_descriptor, + ::arrow::MemoryPool *const pool, + DecoderType **out_decoder) +{ + const ::parquet::DictionaryPage *dictionary_page = + static_cast(page); + + int encoding = static_cast(dictionary_page->encoding()); + if (dictionary_page->encoding() == ::parquet::Encoding::PLAIN_DICTIONARY || dictionary_page->encoding() == ::parquet::Encoding::PLAIN) + { + encoding = static_cast(::parquet::Encoding::RLE_DICTIONARY); + } + + auto it = decoders.find(encoding); + if (it != decoders.end()) + { + throw ::parquet::ParquetException( + "Column cannot have more than one dictionary."); + } + + if (dictionary_page->encoding() == ::parquet::Encoding::PLAIN_DICTIONARY || dictionary_page->encoding() == ::parquet::Encoding::PLAIN) + { + internal::PlainDecoder dictionary(column_descriptor); + dictionary.SetData( + dictionary_page->num_values(), page->data(), page->size()); + + auto decoder = std::make_shared>(column_descriptor, pool); + decoder->SetDict(&dictionary); + decoders[encoding] = decoder; + } + else + { + ::parquet::ParquetException::NYI( + "only plain dictionary encoding has been implemented"); + } + + *out_decoder = decoders[encoding].get(); +} + +static inline bool +_IsDictionaryIndexEncoding(const ::parquet::Encoding::type &e) +{ + return e == ::parquet::Encoding::RLE_DICTIONARY || e == ::parquet::Encoding::PLAIN_DICTIONARY; +} + +template +static inline std::int64_t +_ReadValues(DecoderType *decoder, std::int64_t batch_size, T *out) +{ + std::int64_t num_decoded = + decoder->Decode(out, static_cast(batch_size)); + return num_decoded; +} + +template +bool ColumnReader::HasNext() +{ + if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) + { + if (!ReadNewPage() || num_buffered_values_ == 0) + { + return false; + } + } + return true; +} + +template +bool ColumnReader::ReadNewPage() +{ + const std::uint8_t *buffer; + + for (;;) + { + current_page_ = pager_->NextPage(); + if (!current_page_) + { + return false; + } + + if (current_page_->type() == ::parquet::PageType::DICTIONARY_PAGE) + { + _ConfigureDictionary(current_page_.get(), + decoders_, + descr_, + pool_, + ¤t_decoder_); + continue; + } + else if (current_page_->type() == ::parquet::PageType::DATA_PAGE) + { + const ::parquet::DataPage *page = + static_cast(current_page_.get()); + + num_buffered_values_ = page->num_values(); + num_decoded_values_ = 0; + buffer = page->data(); + + std::int64_t data_size = page->size(); + + if (descr_->max_repetition_level() > 0) + { + std::int64_t rep_levels_bytes = + repetition_level_decoder_.SetData( + page->repetition_level_encoding(), + descr_->max_repetition_level(), + static_cast(num_buffered_values_), + buffer); + buffer += rep_levels_bytes; + data_size -= rep_levels_bytes; + } + + if (descr_->max_definition_level() > 0) + { + std::int64_t def_levels_bytes = + def_level_decoder_.SetData( + page->definition_level_encoding(), + descr_->max_definition_level(), + static_cast(num_buffered_values_), + buffer); + buffer += def_levels_bytes; + data_size -= def_levels_bytes; + } + + ::parquet::Encoding::type encoding = page->encoding(); 
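+
+            // Data pages written with dictionary indices may report either
+            // PLAIN_DICTIONARY or RLE_DICTIONARY; both are decoded the same
+            // way, so the encoding is normalized to RLE_DICTIONARY before the
+            // decoder cache below is consulted (or a new decoder is created).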
+ + if (_IsDictionaryIndexEncoding(encoding)) + { + encoding = ::parquet::Encoding::RLE_DICTIONARY; + } + + auto it = decoders_.find(static_cast(encoding)); + if (it != decoders_.end()) + { + if (encoding == ::parquet::Encoding::RLE_DICTIONARY) + { + DCHECK(current_decoder_->encoding() == ::parquet::Encoding::RLE_DICTIONARY); + } + current_decoder_ = it->second.get(); + } + else + { + switch (encoding) + { + case ::parquet::Encoding::PLAIN: + { + std::shared_ptr decoder( + new internal::PlainDecoder(descr_)); + decoders_[static_cast(encoding)] = decoder; + current_decoder_ = decoder.get(); + break; + } + case ::parquet::Encoding::RLE_DICTIONARY: + throw ::parquet::ParquetException( + "Dictionary page must be before data page."); + + case ::parquet::Encoding::DELTA_BINARY_PACKED: + case ::parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: + case ::parquet::Encoding::DELTA_BYTE_ARRAY: + ::parquet::ParquetException::NYI("Unsupported encoding"); + + default: + throw ::parquet::ParquetException( + "Unknown encoding type."); + } + } + current_decoder_->SetData(static_cast(num_buffered_values_), + buffer, + static_cast(data_size)); + return true; + } + else + { + continue; + } + } + return true; +} + +static inline bool +_HasSpacedValues(const ::parquet::ColumnDescriptor *descr) +{ + if (descr->max_repetition_level() > 0) + { + return !descr->schema_node()->is_required(); + } + else + { + const ::parquet::schema::Node *node = descr->schema_node().get(); + while (node) + { + if (node->is_optional()) + { + return true; + } + node = node->parent(); + } + return false; + } +} + + +struct is_equal +{ + int16_t max_definition_level; + + is_equal(int16_t max_definition_level) + : max_definition_level(max_definition_level) + { + + } + __host__ __device__ bool operator()(const int16_t &x) + { + return x == max_definition_level; + } +}; + +// expands data vector that does not contain nulls into a representation that has indeterminate values where there should be nulls +// A vector of int work_space needs to be allocated to hold the map for the scatter operation. 
The workspace should be of size batch_size +template +void compact_to_sparse_for_nulls(T *data_in, T *data_out, const int16_t *definition_levels, int16_t max_definition_level, + int64_t batch_size, int *work_space) +{ + is_equal op(max_definition_level); + auto out_iter = thrust::copy_if(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(batch_size), + definition_levels, + work_space, + op); + int num_not_null = out_iter - work_space; + thrust::scatter(thrust::device, data_in, data_in + num_not_null, work_space, data_out); +} + + +#define WARP_BYTE 4 +#define WARP_SIZE 32 +#define WARP_MASK 0xFFFFFFFF +constexpr unsigned int THREAD_BLOCK_SIZE{256}; + +template +__global__ void transform_valid_kernel(uint8_t* valid, const int64_t size, Functor is_valid) { + size_t tid = threadIdx.x; + size_t blkid = blockIdx.x; + size_t blksz = blockDim.x; + size_t gridsz = gridDim.x; + + size_t step = blksz * gridsz; + size_t i = tid + blkid * blksz; + + while (i < size) { + uint32_t bitmask = 0; + uint32_t result = is_valid(i); + bitmask = (-result << (i % WARP_SIZE)); + + #pragma unroll + for (size_t offset = 16; offset > 0; offset /= 2) { + bitmask += __shfl_down_sync(WARP_MASK, bitmask, offset); + } + + if ((i % WARP_SIZE) == 0) { + int index = i / WARP_SIZE * WARP_BYTE; + valid[index + 0] = 0xFF & bitmask; + valid[index + 1] = 0xFF & (bitmask >> 8); + valid[index + 2] = 0xFF & (bitmask >> 16); + valid[index + 3] = 0xFF & (bitmask >> 24); + } + i += step; + } +} + +template +__global__ void transform_valid_kernel(uint8_t* valid, const int64_t size, size_t num_chars, Functor is_valid) { + size_t tid = threadIdx.x; + size_t blkid = blockIdx.x; + size_t blksz = blockDim.x; + size_t gridsz = gridDim.x; + + size_t step = blksz * gridsz; + size_t i = tid + blkid * blksz; + + while (i < size) { + uint32_t bitmask = 0; + uint32_t result = is_valid(i); + bitmask = (-result << (i % WARP_SIZE)); + + #pragma unroll + for (size_t offset = 16; offset > 0; offset /= 2) { + bitmask += __shfl_down_sync(WARP_MASK, bitmask, offset); + } + + if ((i % WARP_SIZE) == 0) { + int index = i / WARP_SIZE * WARP_BYTE; + if (index + 0 < num_chars) + valid[index + 0] = 0xFF & bitmask; + if (index + 1 < num_chars) + valid[index + 1] = 0xFF & (bitmask >> 8); + if (index + 2 < num_chars) + valid[index + 2] = 0xFF & (bitmask >> 16); + if (index + 3 < num_chars) + valid[index + 3] = 0xFF & (bitmask >> 24); + } + i += step; + } +} + +template +void transform_valid(uint8_t* valid, const int64_t size, Functor is_valid) { + const dim3 grid ((size + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE, 1, 1); + const dim3 block (THREAD_BLOCK_SIZE, 1, 1); + if (size % 32 == 0) { + transform_valid_kernel <<>>(valid, size, is_valid); + } + else { + size_t num_chars = gdf_get_num_chars_bitmask(size); + transform_valid_kernel <<>>(valid, size, num_chars, is_valid); + } +} + +struct TurnOnFunctor { + __host__ __device__ uint32_t operator() (size_t index) { + return 0xFFFFFFFF; + } +}; + +static inline void _TurnBitOnForValids(std::int64_t def_length, + std::uint8_t * d_valid_ptr, + const std::int64_t valid_bits_offset) +{ + if (valid_bits_offset % 8 == 0) { + transform_valid(d_valid_ptr + valid_bits_offset / 8, def_length, TurnOnFunctor{}); + } else { + size_t left_bits_length = valid_bits_offset % 8; + size_t rigth_bits_length = 8 - left_bits_length; + uint8_t mask; + cudaMemcpy(&mask, d_valid_ptr + (valid_bits_offset/8), 1, cudaMemcpyDeviceToHost); + + for(size_t i = 0; i < rigth_bits_length; i++) { + mask |= 
gdf::util::byte_bitmask(i + left_bits_length); + } + cudaMemcpy(d_valid_ptr + valid_bits_offset / 8, &mask, sizeof(uint8_t), cudaMemcpyHostToDevice); + transform_valid((d_valid_ptr + valid_bits_offset / 8 + 1), def_length, TurnOnFunctor{}); + } +} + +struct IsValidFunctor { + const std::int16_t *d_def_levels; + std::int16_t max_definition_level; + IsValidFunctor (const std::int16_t *d_def_levels, std::int16_t max_definition_level) : d_def_levels {d_def_levels}, max_definition_level{max_definition_level} + { + } + __host__ __device__ uint32_t operator() (size_t index) { + return d_def_levels[index] == max_definition_level ? 0xFFFFFFFF : 0x00000000; + } +}; + +static inline void +_DefinitionLevelsToBitmap(const std::int16_t *d_def_levels, + std::int64_t def_length, + const std::int16_t max_definition_level, + std::int64_t * values_read, + std::int64_t * null_count, + std::uint8_t * d_valid_ptr, + const std::int64_t valid_bits_offset) { + + if (valid_bits_offset % 8 == 0) { + transform_valid( + (d_valid_ptr + valid_bits_offset / 8), + def_length, + IsValidFunctor{ d_def_levels, max_definition_level }); + } else { + int left_bits_length = valid_bits_offset % 8; + int right_bits_length = 8 - left_bits_length; + uint8_t mask; + cudaMemcpy(&mask, d_valid_ptr + (valid_bits_offset/8), 1, cudaMemcpyDeviceToHost); + + thrust::host_vector h_def_levels(right_bits_length); + cudaMemcpy(h_def_levels.data(), d_def_levels, right_bits_length * sizeof(int16_t), cudaMemcpyDeviceToHost); + for(size_t i = 0; i < h_def_levels.size(); i++) { + if (h_def_levels[i] == max_definition_level) { + mask |= gdf::util::byte_bitmask(i + left_bits_length); + } else { + if (h_def_levels[i] < max_definition_level) { + mask &= gdf::util::flipped_bitmask(i + left_bits_length); + } + } + } + cudaMemcpy(d_valid_ptr + valid_bits_offset / 8, &mask, sizeof(uint8_t), cudaMemcpyHostToDevice); + transform_valid (d_valid_ptr + valid_bits_offset/8 + 1, + def_length - right_bits_length, + IsValidFunctor{d_def_levels + right_bits_length, max_definition_level}); + } + int not_null_count = thrust::count(thrust::device_pointer_cast(d_def_levels), thrust::device_pointer_cast(d_def_levels) + def_length, max_definition_level); + *null_count = def_length - not_null_count; + *values_read = not_null_count; +} + +template +static inline std::int64_t +_ReadValuesSpaced(DecoderType *decoder, + std::int64_t batch_size, + T *out, + std::int64_t null_count, + std::uint8_t *valid_bits, + std::int64_t valid_bits_offset) +{ + return decoder->DecodeSpaced(out, + static_cast(batch_size), + static_cast(null_count), + valid_bits, + valid_bits_offset); +} + +template +inline std::int64_t +ColumnReader::ReadBatchSpaced(std::int64_t batch_size, + std::int16_t *definition_levels, + std::int16_t *repetition_levels, + T *values, + std::uint8_t *valid_bits, + std::int64_t valid_bits_offset, // + std::int64_t *levels_read, + std::int64_t *values_read, + std::int64_t *nulls_count) +{ + if (!HasNext()) + { + *levels_read = 0; + *values_read = 0; + *nulls_count = 0; + return 0; + } + + std::int64_t total_values; + + batch_size = std::min(batch_size, num_buffered_values_ - num_decoded_values_); + + if (descr_->max_definition_level() > 0) + { + std::int64_t num_def_levels = ReadDefinitionLevels(batch_size, definition_levels); + + const bool has_spaced_values = _HasSpacedValues(descr_); + + std::int64_t null_count = 0; + if (!has_spaced_values) + { + int result = thrust::count(thrust::device_pointer_cast(definition_levels), thrust::device_pointer_cast(definition_levels) + 
num_def_levels, descr_->max_definition_level()); + int values_to_read = result; + + total_values = _ReadValues(current_decoder_, values_to_read, values); + _TurnBitOnForValids(total_values, valid_bits, valid_bits_offset); + *values_read = total_values; + } + else + { + std::int16_t max_definition_level = descr_->max_definition_level(); + std::int16_t max_repetition_level = descr_->max_repetition_level(); + + _DefinitionLevelsToBitmap( + definition_levels, + num_def_levels, + max_definition_level, + values_read, + &null_count, + valid_bits, + valid_bits_offset); + + total_values = _ReadValues(current_decoder_, *values_read, values); + total_values = num_def_levels; + + if (total_values != *values_read) { + thrust::device_vector work_space_vector(total_values); + int* work_space = thrust::raw_pointer_cast(work_space_vector.data()); + thrust::device_vector d_values_in(values, values + total_values); + compact_to_sparse_for_nulls(thrust::raw_pointer_cast(d_values_in.data()), + values, + definition_levels, + max_definition_level, + total_values, + work_space); + } + } + *levels_read = num_def_levels; + *nulls_count = null_count; + } + else { + total_values = _ReadValues(current_decoder_, batch_size, values); + _TurnBitOnForValids(total_values, valid_bits, valid_bits_offset); + *nulls_count = 0; + *levels_read = total_values; + } + + ConsumeBufferedValues(*levels_read); + + return total_values; +} + +template +inline std::int64_t +ColumnReader::ReadBatch(std::int64_t batch_size, + std::int16_t *def_levels, + std::int16_t *rep_levels, + T *values, + std::int64_t *values_read) +{ + // assert(rep_levels == nullptr); + if (!HasNext()) + { + *values_read = 0; + return 0; + } + batch_size = std::min(batch_size, num_buffered_values_ - num_decoded_values_); + + std::int64_t num_def_levels = 0; + + std::int64_t values_to_read = 0; + + if (descr_->max_definition_level() > 0 && def_levels) + { + num_def_levels = ReadDefinitionLevels(batch_size, def_levels); + int result = thrust::count(thrust::device_pointer_cast(def_levels), thrust::device_pointer_cast(def_levels) + num_def_levels, descr_->max_definition_level()); + values_to_read = result; + } + else + { + values_to_read = batch_size; + } + + *values_read = _ReadValues(current_decoder_, values_to_read, values); + std::int64_t total_values = std::max(num_def_levels, *values_read); + ConsumeBufferedValues(total_values); + + return total_values; +} + +template +struct ParquetTraits +{ +}; + +#define TYPE_TRAITS_FACTORY(ParquetType, GdfDType) \ + template <> \ + struct ParquetTraits \ + { \ + static constexpr gdf_dtype gdfDType = GdfDType; \ + } + +TYPE_TRAITS_FACTORY(::parquet::BooleanType, GDF_INT8); +TYPE_TRAITS_FACTORY(::parquet::Int32Type, GDF_INT32); +TYPE_TRAITS_FACTORY(::parquet::Int64Type, GDF_INT64); +TYPE_TRAITS_FACTORY(::parquet::FloatType, GDF_FLOAT32); +TYPE_TRAITS_FACTORY(::parquet::DoubleType, GDF_FLOAT64); + +#undef TYPE_TRAITS_FACTORY + + +template +std::size_t ColumnReader::ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, + std::uint8_t & first_valid_byte, std::uint8_t & last_valid_byte) { + + if (!HasNext()) { + return 0; + } + std::int64_t values_to_read = num_buffered_values_ - num_decoded_values_; + + thrust::device_vector d_def_levels(values_to_read);//this size is work group size + std::int16_t *d_definition_levels = thrust::raw_pointer_cast(d_def_levels.data()); + + std::size_t rows_read_total = ToGdfColumn(column, offset, d_definition_levels); + + std::int16_t max_definition_level = 
descr_->max_definition_level(); + + if (offset > 0 && offset % 8 != 0){ // need to figure out the first_valid_byte + first_valid_byte = 0; + + int left_bits_length = offset % 8; + int right_bits_length = 8 - left_bits_length; + + thrust::host_vector h_def_levels(right_bits_length); + cudaMemcpy(h_def_levels.data(), d_definition_levels, right_bits_length * sizeof(int16_t), cudaMemcpyDeviceToHost); + for(size_t i = 0; i < h_def_levels.size(); i++) { + if (h_def_levels[i] == max_definition_level) { + first_valid_byte |= gdf::util::byte_bitmask(i + left_bits_length); + } else { + if (h_def_levels[i] < max_definition_level) { + first_valid_byte &= gdf::util::flipped_bitmask(i + left_bits_length); + } + } + } + } + if ( (offset + values_to_read) % 8 != 0 ) { // need to figure out the last_valid_byte + last_valid_byte = 0; + + int left_bits_length = (offset + values_to_read) % 8; + int right_bits_length = 8 - left_bits_length; + + thrust::host_vector h_def_levels(left_bits_length); + cudaMemcpy(h_def_levels.data(), d_definition_levels + values_to_read - left_bits_length, left_bits_length * sizeof(int16_t), cudaMemcpyDeviceToHost); + for(size_t i = 0; i < h_def_levels.size(); i++) { + if (h_def_levels[i] == max_definition_level) { + last_valid_byte |= gdf::util::byte_bitmask(i); + } else { + if (h_def_levels[i] < max_definition_level) { + last_valid_byte &= gdf::util::flipped_bitmask(i); + } + } + } + } + + return rows_read_total; + +} + +template +std::size_t ColumnReader::ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset) { + if (!HasNext()) { + return 0; + } + std::int64_t values_to_read = num_buffered_values_ - num_decoded_values_; + + thrust::device_vector d_def_levels(values_to_read);//this size is work group size + std::int16_t *d_definition_levels = thrust::raw_pointer_cast(d_def_levels.data()); + + return ToGdfColumn(column, offset, d_definition_levels); +} + +template +std::size_t ColumnReader::ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, std::int16_t *d_definition_levels) { + if (!HasNext()) { + return 0; + } + using c_type = typename DataType::c_type; + + c_type *const values = static_cast(column.data) + offset; + std::uint8_t *const d_valid_bits = + static_cast(column.valid) + (offset / 8); + + static std::int64_t levels_read = 0; + static std::int64_t values_read = 0; + static std::int64_t nulls_count = 0; + + int64_t rows_read_total = 0; + std::int64_t values_to_read = num_buffered_values_ - num_decoded_values_; + + do { + values_to_read = num_buffered_values_ - num_decoded_values_; + int64_t rows_read = ReadBatchSpaced( + values_to_read, + d_definition_levels + rows_read_total, + nullptr, + static_cast(values + rows_read_total), + d_valid_bits, + rows_read_total + (offset % 8), + &levels_read, + &values_read, + &nulls_count); + + rows_read_total += rows_read; + } while (this->HasNext()); + return static_cast(rows_read_total); +} + +template class ColumnReader<::parquet::BooleanType>; +template class ColumnReader<::parquet::Int32Type>; +template class ColumnReader<::parquet::Int64Type>; +template class ColumnReader<::parquet::FloatType>; +template class ColumnReader<::parquet::DoubleType>; + +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h new file mode 100644 index 00000000..5534acb4 --- /dev/null +++ b/src/parquet/column_reader.h @@ -0,0 +1,92 @@ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GDF_PARQUET_COLUMN_READER_H +#define _GDF_PARQUET_COLUMN_READER_H + +#include +#include +#include "decoder/cu_level_decoder.h" + +namespace gdf { +namespace parquet { + +template +class ColumnReader : public ::parquet::ColumnReader { +public: + using T = typename DataType::c_type; + + ColumnReader(const ::parquet::ColumnDescriptor* schema, std::unique_ptr<::parquet::PageReader> pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : ::parquet::ColumnReader(schema, std::move(pager), pool), current_decoder_(nullptr) {} + + + bool HasNext(); + + std::int64_t ReadBatch(std::int64_t batch_size, + std::int16_t *def_levels, + std::int16_t *rep_levels, + T * values, + std::int64_t *values_read); + + std::int64_t ReadBatchSpaced(std::int64_t batch_size, + std::int16_t *definition_levels, + std::int16_t *repetition_levels, + T * values, + std::uint8_t *valid_bits, + std::int64_t valid_bits_offset, + std::int64_t *levels_read, + std::int64_t *values_read, + std::int64_t *nulls_count); + + std::size_t ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset = 0); + + std::size_t ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, std::int16_t *d_definition_levels); + + std::size_t ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, std::uint8_t & first_valid_byte, std::uint8_t & last_valid_byte); + + + int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) { + if (descr_->max_definition_level() == 0) { + return 0; + } + return def_level_decoder_.Decode(static_cast(batch_size), levels); + } + + + +private: + bool ReadNewPage() final; + + using DecoderType = ::parquet::Decoder; + + std::unordered_map> decoders_; + DecoderType * current_decoder_; + gdf::parquet::decoder::CUDALevelDecoder def_level_decoder_; +}; + +using BoolReader = ColumnReader<::parquet::BooleanType>; +using Int32Reader = ColumnReader<::parquet::Int32Type>; +using Int64Reader = ColumnReader<::parquet::Int64Type>; +using FloatReader = ColumnReader<::parquet::FloatType>; +using DoubleReader = ColumnReader<::parquet::DoubleType>; + +} // namespace parquet +} // namespace gdf + +#endif diff --git a/src/parquet/decoder/cu_level_decoder.cu b/src/parquet/decoder/cu_level_decoder.cu new file mode 100644 index 00000000..628716b6 --- /dev/null +++ b/src/parquet/decoder/cu_level_decoder.cu @@ -0,0 +1,95 @@ +// +// Created by aocsa on 8/25/18. 
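+// CUDALevelDecoder decodes Parquet definition/repetition level runs into
+// device memory: RLE-encoded levels go through gdf::arrow::internal::RleDecoder,
+// while BIT_PACKED levels are unpacked on the GPU via unpack_using_gpu.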
+// + +#include "arrow/util/rle-encoding.h" +#include +#include "../../arrow/rle_decoder.h" +#include "../../arrow/bit-stream.h" + +#include "cu_level_decoder.h" + +namespace gdf { +namespace parquet { +namespace decoder { + +CUDALevelDecoder::CUDALevelDecoder() + : num_values_remaining_(0), rle_decoder_(nullptr), bit_packed_decoder_(nullptr) +{ +} + +CUDALevelDecoder::~CUDALevelDecoder() {} + +int CUDALevelDecoder::SetData(::parquet::Encoding::type encoding, + int16_t max_level, int num_buffered_values, + const uint8_t* data) +{ + int32_t num_bytes = 0; + encoding_ = encoding; + num_values_remaining_ = num_buffered_values; + bit_width_ = ::arrow::BitUtil::Log2(max_level + 1); + switch (encoding) { + case ::parquet::Encoding::RLE: { + num_bytes = *reinterpret_cast(data); + const uint8_t* decoder_data = data + sizeof(int32_t); + if (rle_decoder_ == nullptr) { + rle_decoder_.reset( + new gdf::arrow::internal::RleDecoder(decoder_data, num_bytes, bit_width_)); + } else { + rle_decoder_->Reset(decoder_data, num_bytes, bit_width_); + } + return sizeof(int32_t) + num_bytes; + } + case ::parquet::Encoding::BIT_PACKED: { + num_bytes = static_cast( + ::arrow::BitUtil::Ceil(num_buffered_values * bit_width_, 8)); + if (!bit_packed_decoder_) { + bit_packed_decoder_.reset(new gdf::arrow::internal::BitReader(data, num_bytes)); + } else { + bit_packed_decoder_->Reset(data, num_bytes); + } + return num_bytes; + } + default: + throw ::parquet::ParquetException("Unknown encoding type for levels."); + } +} + +int CUDALevelDecoder::Decode(int batch_size, int16_t* d_levels) +{ + int num_decoded = 0; + int num_values = std::min(num_values_remaining_, batch_size); + if (encoding_ == ::parquet::Encoding::RLE) { + num_decoded = rle_decoder_->GetBatch(d_levels, num_values); + } else { + // num_decoded = bit_packed_decoder_->GetBatch(bit_width_, d_levels, num_values); + int literal_batch = num_values; + int values_read = 0; + std::vector rleRuns; + std::vector rleValues; + int numRle; + int numBitpacked; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + bit_packed_decoder_->SetGpuBatchMetadata( + 1, d_levels, literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + + num_decoded = gdf::arrow::internal::unpack_using_gpu ( + bit_packed_decoder_->get_buffer(), bit_packed_decoder_->get_buffer_len(), + unpack32InputOffsets, + unpack32InputRunLengths, + unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, bit_width_, d_levels, literal_batch); + } + num_values_remaining_ -= num_decoded; + return num_decoded; +} + +} // namespace decoder +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/decoder/cu_level_decoder.h b/src/parquet/decoder/cu_level_decoder.h new file mode 100644 index 00000000..2317c54c --- /dev/null +++ b/src/parquet/decoder/cu_level_decoder.h @@ -0,0 +1,43 @@ +// +// Created by aocsa on 8/25/18. 
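+//
+// Illustrative usage sketch (not part of the build); it assumes `page_data`
+// points to the encoded level run and `d_levels` is a device buffer with room
+// for `batch_size` int16_t values:
+//
+//   gdf::parquet::decoder::CUDALevelDecoder level_decoder;
+//   int bytes_consumed = level_decoder.SetData(::parquet::Encoding::RLE,
+//                                              max_definition_level,
+//                                              num_buffered_values,
+//                                              page_data);
+//   int levels_decoded = level_decoder.Decode(batch_size, d_levels);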
+// + +#ifndef _GDF_PARQUET_CUDALEVELDECODER_H +#define _GDF_PARQUET_CUDALEVELDECODER_H + +#include "parquet/types.h" +#include +#include "../../arrow/rle_decoder.h" +#include "../../arrow/bit-stream.h" + +namespace gdf { +namespace parquet { +namespace decoder { + +class CUDALevelDecoder { +public: + CUDALevelDecoder(); + + ~CUDALevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(::parquet::Encoding::type encoding, int16_t max_level, + int num_buffered_values, const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels + // decoded + int Decode(int batch_size, int16_t* levels); + +private: + int bit_width_; + int num_values_remaining_; + ::parquet::Encoding::type encoding_; + std::unique_ptr< gdf::arrow::internal::RleDecoder> rle_decoder_; + std::unique_ptr< gdf::arrow::internal::BitReader> bit_packed_decoder_; +}; +} // namespace decoder +} // namespace parquet +} // namespace gdf + +#endif //_GDF_PARQUET_CUDALEVELDECODER_H diff --git a/src/parquet/dictionary_decoder.cuh b/src/parquet/dictionary_decoder.cuh new file mode 100644 index 00000000..c5dccb0c --- /dev/null +++ b/src/parquet/dictionary_decoder.cuh @@ -0,0 +1,117 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../arrow/rle_decoder.h" +#include "arrow/util/rle-encoding.h" +#include + +namespace parquet { +class ColumnDescriptor; +} + +namespace gdf { +namespace parquet { +namespace internal { + +template +class DictionaryDecoder : public ::parquet::Decoder { +public: + typedef typename Type::c_type T; + + explicit DictionaryDecoder( + const ::parquet::ColumnDescriptor *descr, + ::arrow::MemoryPool * pool = nullptr) + : ::parquet::Decoder(descr, ::parquet::Encoding::RLE_DICTIONARY), + dictionary_(0) + { + } + + void SetDict(::parquet::Decoder *dictionary); + + void + SetData(int num_values, const std::uint8_t *data, int len) override { + num_values_ = num_values; + if (len == 0) return; + std::uint8_t bit_width = *data; + ++data; + --len; + idx_decoder_ = RleDecoder(data, len, bit_width); + } + + int + Decode(T *buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int decoded_values = idx_decoder_.GetBatchWithDict( + thrust::raw_pointer_cast(dictionary_.data()), num_dictionary_values_, buffer, max_values); + if (decoded_values != max_values) { + ::parquet::ParquetException::EofException(); + } + num_values_ -= max_values; + return max_values; + } + + int + DecodeSpaced(T * buffer, + int num_values, + int null_count, + const std::uint8_t *valid_bits, + std::int64_t valid_bits_offset) override { + int decoded_values = + idx_decoder_.GetBatchWithDictSpaced( thrust::raw_pointer_cast(dictionary_.data()), + num_dictionary_values_, + buffer, + num_values, + null_count, + valid_bits, + valid_bits_offset); + if (decoded_values != num_values) { + ::parquet::ParquetException::EofException(); + } + return decoded_values; + } + +private: + using ::parquet::Decoder::num_values_; + + thrust::device_vector dictionary_; + + RleDecoder idx_decoder_; + + int num_dictionary_values_; +}; + +template +inline void +DictionaryDecoder::SetDict( + ::parquet::Decoder *dictionary) { + int num_dictionary_values = dictionary->values_left(); + num_dictionary_values_ = num_dictionary_values; + dictionary_.resize(num_dictionary_values); + dictionary->Decode(thrust::raw_pointer_cast(dictionary_.data()), num_dictionary_values); +} + +template <> +inline void +DictionaryDecoder<::parquet::BooleanType, ::arrow::RleDecoder>::SetDict( + ::parquet::Decoder<::parquet::BooleanType> *) { + ::parquet::ParquetException::NYI( + "Dictionary encoding is not implemented for boolean values"); +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/file_reader.cpp b/src/parquet/file_reader.cpp new file mode 100644 index 00000000..4e23e9a4 --- /dev/null +++ b/src/parquet/file_reader.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "column_reader.h" +#include "file_reader.h" +#include "file_reader_contents.h" + +namespace gdf { +namespace parquet { + +// ---------------------------------------------------------------------- +// RowGroupReader public API + +GdfRowGroupReader::GdfRowGroupReader(std::unique_ptr<::parquet::RowGroupReader::Contents> contents) + : ::parquet::RowGroupReader(nullptr), contents_(std::move(contents)) {} + + +static std::shared_ptr<::parquet::ColumnReader> GdfColumnReaderMake(const ::parquet::ColumnDescriptor* descr, + std::unique_ptr<::parquet::PageReader> pager, + ::arrow::MemoryPool* pool) { + switch (descr->physical_type()) { + case ::parquet::Type::BOOLEAN: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + case ::parquet::Type::INT32: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + break; + case ::parquet::Type::INT64: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + case ::parquet::Type::FLOAT: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + case ::parquet::Type::DOUBLE: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + default: + ::parquet::ParquetException::NYI("type reader not implemented"); + } + // Unreachable code, but supress compiler warning + return std::shared_ptr<::parquet::ColumnReader>(nullptr); +} + + +std::shared_ptr<::parquet::ColumnReader> GdfRowGroupReader::Column(int i) { + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; + const ::parquet::ColumnDescriptor* descr = metadata()->schema()->Column(i); + + std::unique_ptr<::parquet::PageReader> page_reader = contents_->GetColumnPageReader(i); + return GdfColumnReaderMake( + descr, std::move(page_reader), + const_cast<::parquet::ReaderProperties*>(contents_->properties())->memory_pool()); +} + + + +std::unique_ptr<::parquet::PageReader> GdfRowGroupReader::GetColumnPageReader(int i) { + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; + return contents_->GetColumnPageReader(i); +} + +// Returns the rowgroup metadata +const ::parquet::RowGroupMetaData* GdfRowGroupReader::metadata() const { return contents_->metadata(); } + +// ---------------------------------------------------------------------- + +std::unique_ptr +FileReader::OpenFile(const std::string & path, + const ::parquet::ReaderProperties &properties) { + + FileReader *const reader = new FileReader(); + reader->parquetFileReader_.reset(new ::parquet::ParquetFileReader()); + + std::shared_ptr<::arrow::io::ReadableFile> file; + + PARQUET_THROW_NOT_OK( + ::arrow::io::ReadableFile::Open(path, properties.memory_pool(), &file)); + + std::unique_ptr<::parquet::RandomAccessSource> source( + new ::parquet::ArrowInputFile(file)); + + std::unique_ptr<::parquet::ParquetFileReader::Contents> contents( + new internal::FileReaderContents(std::move(source), properties)); + + static_cast(contents.get()) + ->ParseMetaData(); + + reader->parquetFileReader_->Open(std::move(contents)); + + return std::unique_ptr(reader); +} + +std::unique_ptr +FileReader::OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, + const ::parquet::ReaderProperties &properties) { + + 
FileReader *const reader = new FileReader(); + reader->parquetFileReader_.reset(new ::parquet::ParquetFileReader()); + + std::unique_ptr<::parquet::RandomAccessSource> source( + new ::parquet::ArrowInputFile(file)); + + std::unique_ptr<::parquet::ParquetFileReader::Contents> contents( + new internal::FileReaderContents(std::move(source), properties)); + + static_cast(contents.get()) + ->ParseMetaData(); + + reader->parquetFileReader_->Open(std::move(contents)); + + + return std::unique_ptr(reader); +} + +std::shared_ptr +FileReader::RowGroup(int i) { + return std::static_pointer_cast< GdfRowGroupReader >(parquetFileReader_->RowGroup(i)); +} + +std::shared_ptr<::parquet::FileMetaData> +FileReader::metadata() const { + return parquetFileReader_->metadata(); +} + +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/file_reader.h b/src/parquet/file_reader.h new file mode 100644 index 00000000..72643991 --- /dev/null +++ b/src/parquet/file_reader.h @@ -0,0 +1,70 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GDF_PARQUET_FILE_READER_H +#define _GDF_PARQUET_FILE_READER_H + +#include +#include + +namespace gdf { +namespace parquet { + + +class GdfRowGroupReader : public ::parquet::RowGroupReader { +public: + + explicit GdfRowGroupReader(std::unique_ptr contents); + + // Returns the rowgroup metadata + const ::parquet::RowGroupMetaData* metadata() const; + + // Construct a ColumnReader for the indicated row group-relative + // column. Ownership is shared with the RowGroupReader. + std::shared_ptr<::parquet::ColumnReader> Column(int i); + + std::unique_ptr<::parquet::PageReader> GetColumnPageReader(int i); + +private: + // Holds a pointer to an instance of Contents implementation + std::unique_ptr contents_; +}; + +class FileReader { +public: + static std::unique_ptr + OpenFile(const std::string & path, + const ::parquet::ReaderProperties &properties = + ::parquet::default_reader_properties()); + + static std::unique_ptr + OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, + const ::parquet::ReaderProperties &properties = + ::parquet::default_reader_properties()); + + std::shared_ptr RowGroup(int i); + std::shared_ptr<::parquet::FileMetaData> metadata() const; + +private: + std::unique_ptr<::parquet::ParquetFileReader> parquetFileReader_; +}; + +} // namespace parquet +} // namespace gdf + +#endif diff --git a/src/parquet/file_reader_contents.cpp b/src/parquet/file_reader_contents.cpp new file mode 100644 index 00000000..d633a74d --- /dev/null +++ b/src/parquet/file_reader_contents.cpp @@ -0,0 +1,112 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "file_reader_contents.h" + +#include "row_group_reader_contents.h" + +#include "file_reader.h" + +namespace gdf { +namespace parquet { +namespace internal { + +FileReaderContents::FileReaderContents( + std::unique_ptr<::parquet::RandomAccessSource> source, + const ::parquet::ReaderProperties & properties) + : source_(std::move(source)), properties_(properties) {} + +FileReaderContents::~FileReaderContents() { + try { + Close(); + } catch (...) {} +} + +void +FileReaderContents::Close() { + source_->Close(); +} + +std::shared_ptr<::parquet::RowGroupReader> +FileReaderContents::GetRowGroup(int i) { + std::unique_ptr contents( + new internal::RowGroupReaderContents( + source_.get(), file_metadata_.get(), i, properties_)); + return std::static_pointer_cast<::parquet::RowGroupReader>(std::make_shared(std::move(contents))); +} + +std::shared_ptr<::parquet::FileMetaData> +FileReaderContents::metadata() const { + return file_metadata_; +} + +void +FileReaderContents::ParseMetaData() { + std::int64_t file_size = source_->Size(); + + if (file_size < FOOTER_SIZE) { + throw ::parquet::ParquetException( + "Corrupted file, smaller than file footer"); + } + + std::uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE]; + std::int64_t footer_read_size = + std::min(file_size, DEFAULT_FOOTER_READ_SIZE); + std::int64_t bytes_read = source_->ReadAt( + file_size - footer_read_size, footer_read_size, footer_buffer); + + if (bytes_read != footer_read_size + || std::memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) + != 0) { + throw ::parquet::ParquetException( + "Invalid parquet file. Corrupt footer."); + } + + std::uint32_t metadata_len = *reinterpret_cast( + footer_buffer + footer_read_size - FOOTER_SIZE); + std::int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; + if (FOOTER_SIZE + metadata_len > file_size) { + throw ::parquet::ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + + std::shared_ptr<::parquet::PoolBuffer> metadata_buffer = + ::parquet::AllocateBuffer(properties_.memory_pool(), metadata_len); + + if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { + std::memcpy(metadata_buffer->mutable_data(), + footer_buffer + + (footer_read_size - metadata_len - FOOTER_SIZE), + metadata_len); + } else { + bytes_read = source_->ReadAt( + metadata_start, metadata_len, metadata_buffer->mutable_data()); + if (bytes_read != metadata_len) { + throw ::parquet::ParquetException( + "Invalid parquet file. Could not read metadata bytes."); + } + } + + file_metadata_ = + ::parquet::FileMetaData::Make(metadata_buffer->data(), &metadata_len); +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/file_reader_contents.h b/src/parquet/file_reader_contents.h new file mode 100644 index 00000000..c75ac0b1 --- /dev/null +++ b/src/parquet/file_reader_contents.h @@ -0,0 +1,49 @@ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace gdf { +namespace parquet { +namespace internal { + +class FileReaderContents : public ::parquet::ParquetFileReader::Contents { +public: + FileReaderContents(std::unique_ptr<::parquet::RandomAccessSource> source, + const ::parquet::ReaderProperties &properties = + ::parquet::default_reader_properties()); + + ~FileReaderContents() final; + void Close() final; + std::shared_ptr<::parquet::RowGroupReader> GetRowGroup(int i) final; + std::shared_ptr<::parquet::FileMetaData> metadata() const final; + + void ParseMetaData(); + +private: + std::unique_ptr<::parquet::RandomAccessSource> source_; + std::shared_ptr<::parquet::FileMetaData> file_metadata_; + ::parquet::ReaderProperties properties_; + + const int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024; + const uint32_t FOOTER_SIZE = 8; + const uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; +}; + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/page_reader.cpp b/src/parquet/page_reader.cpp new file mode 100644 index 00000000..5dcfcf46 --- /dev/null +++ b/src/parquet/page_reader.cpp @@ -0,0 +1,201 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "page_reader.h" + +#include +#include + +namespace gdf { +namespace parquet { +namespace internal { + +namespace { +template +inline void +DeserializeThriftMsg(const std::uint8_t *buf, + std::uint32_t * len, + T * deserialized_msg) { + std::shared_ptr tmem_transport( + new apache::thrift::transport::TMemoryBuffer( + const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + std::shared_ptr tproto = + tproto_factory.getProtocol(tmem_transport); + + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception &e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ::parquet::ParquetException(ss.str()); + } + + std::uint32_t bytes_left = tmem_transport->available_read(); + + *len = *len - bytes_left; +} + +static inline ::parquet::Encoding::type +FromThrift(::parquet::format::Encoding::type type) { + return static_cast<::parquet::Encoding::type>(type); +} +} // namespace + +PageReader::PageReader(std::unique_ptr<::parquet::InputStream> stream, + std::int64_t total_num_rows, + ::parquet::Compression::type codec, + arrow::MemoryPool * pool) + : stream_(std::move(stream)), + decompression_buffer_(::parquet::AllocateBuffer(pool, 0)), + seen_num_rows_(0), total_num_rows_(total_num_rows) { + max_page_header_size_ = kDefaultMaxPageHeaderSize; + decompressor_ = GetCodecFromArrow(codec); +} + +std::shared_ptr<::parquet::Page> +PageReader::NextPage() { + while (seen_num_rows_ < total_num_rows_) { + std::int64_t bytes_read = 0; + std::int64_t bytes_available = 0; + std::uint32_t header_size = 0; + const std::uint8_t *buffer; + std::uint32_t allowed_page_size = kDefaultPageHeaderSize; + + for (;;) { + buffer = stream_->Peek(allowed_page_size, &bytes_available); + if (bytes_available == 0) { + return std::shared_ptr<::parquet::Page>(nullptr); + } + + header_size = static_cast(bytes_available); + try { + DeserializeThriftMsg( + buffer, &header_size, ¤t_page_header_); + break; + } catch (std::exception &e) { + std::stringstream ss; + ss << e.what(); + allowed_page_size *= 2; + if (allowed_page_size > max_page_header_size_) { + ss << "Deserializing page header failed.\n"; + throw ::parquet::ParquetException(ss.str()); + } + } + } + stream_->Advance(header_size); + + int compressed_len = current_page_header_.compressed_page_size; + int uncompressed_len = current_page_header_.uncompressed_page_size; + + buffer = stream_->Read(compressed_len, &bytes_read); + if (bytes_read != compressed_len) { + ::parquet::ParquetException::EofException(); + } + + if (decompressor_ != nullptr) { + if (uncompressed_len + > static_cast(decompression_buffer_->size())) { + PARQUET_THROW_NOT_OK( + decompression_buffer_->Resize(uncompressed_len, false)); + } + PARQUET_THROW_NOT_OK(decompressor_->Decompress( + compressed_len, + buffer, + uncompressed_len, + decompression_buffer_->mutable_data())); + buffer = decompression_buffer_->data(); + } + + auto page_buffer = + std::make_shared<::parquet::Buffer>(buffer, uncompressed_len); + + if (current_page_header_.type + == ::parquet::format::PageType::DICTIONARY_PAGE) { + const ::parquet::format::DictionaryPageHeader &dict_header = + current_page_header_.dictionary_page_header; + + bool is_sorted = + dict_header.__isset.is_sorted ? 
dict_header.is_sorted : false; + + return std::make_shared<::parquet::DictionaryPage>( + page_buffer, + dict_header.num_values, + FromThrift(dict_header.encoding), + is_sorted); + } else if (current_page_header_.type + == ::parquet::format::PageType::DATA_PAGE) { + const ::parquet::format::DataPageHeader &header = + current_page_header_.data_page_header; + + ::parquet::EncodedStatistics page_statistics; + if (header.__isset.statistics) { + const ::parquet::format::Statistics &stats = header.statistics; + if (stats.__isset.max) { page_statistics.set_max(stats.max); } + if (stats.__isset.min) { page_statistics.set_min(stats.min); } + if (stats.__isset.null_count) { + page_statistics.set_null_count(stats.null_count); + } + if (stats.__isset.distinct_count) { + page_statistics.set_distinct_count(stats.distinct_count); + } + } + + seen_num_rows_ += header.num_values; + + return std::make_shared<::parquet::DataPage>( + page_buffer, + header.num_values, + FromThrift(header.encoding), + FromThrift(header.definition_level_encoding), + FromThrift(header.repetition_level_encoding), + page_statistics); + } else if (current_page_header_.type + == ::parquet::format::PageType::DATA_PAGE_V2) { + const ::parquet::format::DataPageHeaderV2 &header = + current_page_header_.data_page_header_v2; + bool is_compressed = + header.__isset.is_compressed ? header.is_compressed : false; + + seen_num_rows_ += header.num_values; + + return std::make_shared<::parquet::DataPageV2>( + page_buffer, + header.num_values, + header.num_nulls, + header.num_rows, + FromThrift(header.encoding), + header.definition_levels_byte_length, + header.repetition_levels_byte_length, + is_compressed); + } else { + continue; + } + } + return std::shared_ptr<::parquet::Page>(nullptr); +} + +void +PageReader::set_max_page_header_size(std::uint32_t size) { + max_page_header_size_ = size; +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/page_reader.h b/src/parquet/page_reader.h new file mode 100644 index 00000000..49b3d561 --- /dev/null +++ b/src/parquet/page_reader.h @@ -0,0 +1,57 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace gdf { +namespace parquet { +namespace internal { + +class PageReader : public ::parquet::PageReader { +public: + PageReader(std::unique_ptr<::parquet::InputStream> stream, + std::int64_t total_num_rows, + ::parquet::Compression::type codec, + arrow::MemoryPool * pool); + + std::shared_ptr<::parquet::Page> NextPage() final; + + void set_max_page_header_size(std::uint32_t size) override; + +private: + static const std::uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + static const std::uint32_t kDefaultPageHeaderSize = 16 * 1024; + + std::unique_ptr<::parquet::InputStream> stream_; + + ::parquet::format::PageHeader current_page_header_; + std::shared_ptr<::parquet::Page> current_page_; + + std::unique_ptr decompressor_; + std::shared_ptr decompression_buffer_; + + std::uint32_t max_page_header_size_; + + std::int64_t seen_num_rows_; + + std::int64_t total_num_rows_; +}; + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/plain_decoder.cuh b/src/parquet/plain_decoder.cuh new file mode 100644 index 00000000..67e30c4d --- /dev/null +++ b/src/parquet/plain_decoder.cuh @@ -0,0 +1,155 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "../arrow/bit-stream.h" +#include + +namespace parquet { +class ColumnDescriptor; +} + +namespace gdf { +namespace parquet { +namespace internal { + +template +class PlainDecoder : public ::parquet::Decoder { +public: + typedef typename DataType::c_type T; + using ::parquet::Decoder::num_values_; + + explicit PlainDecoder(const ::parquet::ColumnDescriptor *descr) + : ::parquet::Decoder(descr, ::parquet::Encoding::PLAIN), + data_(nullptr), len_(0) { + if (descr_ + && descr_->physical_type() + == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { + type_length_ = descr_->type_length(); + } else { + type_length_ = -1; + } + } + + virtual void + SetData(int num_values, const std::uint8_t *data, int len) { + num_values_ = num_values; + data_ = data; + len_ = len; + } + + virtual int Decode(T *buffer, int max_values); + +private: + using ::parquet::Decoder::descr_; + const std::uint8_t *data_; + int len_; + int type_length_; +}; + +template +inline int +DecodePlain(const std::uint8_t *data, + std::int64_t data_size, + int num_values, + int, + T *out) { + int bytes_to_decode = num_values * static_cast(sizeof(T)); + if (data_size < bytes_to_decode) { + ::parquet::ParquetException::EofException(); + } + cudaMemcpy(out, data, bytes_to_decode, cudaMemcpyHostToDevice); + return bytes_to_decode; +} + +template +inline int +PlainDecoder::Decode(T *buffer, int max_values) { + max_values = std::min(max_values, num_values_); + int bytes_consumed = + DecodePlain(data_, len_, max_values, type_length_, buffer); + data_ += bytes_consumed; + len_ -= bytes_consumed; + num_values_ -= max_values; + return max_values; +} + +template <> +class PlainDecoder<::parquet::BooleanType> + : public ::parquet::Decoder<::parquet::BooleanType> { +public: + explicit PlainDecoder(const ::parquet::ColumnDescriptor *descr) + : ::parquet::Decoder<::parquet::BooleanType>( + descr, + ::parquet::Encoding::PLAIN) {} + + virtual void + SetData(int num_values, const std::uint8_t *data, int len) { + num_values_ = num_values; + bit_reader_ = gdf::arrow::internal::BitReader(data, len); + } + + int + Decode(std::uint8_t *buffer, int max_values) { + max_values = std::min(max_values, num_values_); + bool val; + for (int i = 0; i < max_values; ++i) { + if (!bit_reader_.GetValue(1, &val)) { + ::parquet::ParquetException::EofException(); + } + ::arrow::BitUtil::SetArrayBit(buffer, i, val); + } + num_values_ -= max_values; + return max_values; + } + + virtual int + Decode(bool *buffer, int max_values) { + max_values = std::min(max_values, num_values_); + + int literal_batch = max_values; + int values_read = 0; + std::vector rleRuns; + std::vector rleValues; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + bit_reader_.SetGpuBatchMetadata( + 1, buffer, literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + + gdf::arrow::internal::unpack_using_gpu ( + bit_reader_.get_buffer(), bit_reader_.get_buffer_len(), + unpack32InputOffsets, + unpack32InputRunLengths, + unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, 1, buffer, literal_batch); + + num_values_ -= max_values; + return max_values; + } + +private: + gdf::arrow::internal::BitReader bit_reader_; +}; + +} // namespace internal +} // namespace parquet 
+} // namespace gdf diff --git a/src/parquet/row_group_reader_contents.cpp b/src/parquet/row_group_reader_contents.cpp new file mode 100644 index 00000000..252173b9 --- /dev/null +++ b/src/parquet/row_group_reader_contents.cpp @@ -0,0 +1,80 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "row_group_reader_contents.h" + +#include "page_reader.h" + +namespace gdf { +namespace parquet { +namespace internal { + +RowGroupReaderContents::RowGroupReaderContents( + ::parquet::RandomAccessSource * source, + ::parquet::FileMetaData * file_metadata, + int row_group_number, + const ::parquet::ReaderProperties &props) + : source_(source), file_metadata_(file_metadata), properties_(props) { + row_group_metadata_ = file_metadata->RowGroup(row_group_number); +} + +const ::parquet::RowGroupMetaData * +RowGroupReaderContents::metadata() const { + return row_group_metadata_.get(); +} + +const ::parquet::ReaderProperties * +RowGroupReaderContents::properties() const { + return &properties_; +} + +std::unique_ptr<::parquet::PageReader> +RowGroupReaderContents::GetColumnPageReader(int i) { + auto col = row_group_metadata_->ColumnChunk(i); + + int64_t col_start = col->data_page_offset(); + if (col->has_dictionary_page() + && col_start > col->dictionary_page_offset()) { + col_start = col->dictionary_page_offset(); + } + + std::int64_t col_length = col->total_compressed_size(); + std::unique_ptr<::parquet::InputStream> stream; + + const ::parquet::ApplicationVersion &version = + file_metadata_->writer_version(); + if (version.VersionLt( + ::parquet::ApplicationVersion::PARQUET_816_FIXED_VERSION)) { + std::int64_t bytes_remaining = + source_->Size() - (col_start + col_length); + std::int64_t padding = + std::min(kMaxDictHeaderSize, bytes_remaining); + col_length += padding; + } + + stream = properties_.GetStream(source_, col_start, col_length); + + return std::unique_ptr<::parquet::PageReader>( + new internal::PageReader(std::move(stream), + col->num_values(), + col->compression(), + properties_.memory_pool())); +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/row_group_reader_contents.h b/src/parquet/row_group_reader_contents.h new file mode 100644 index 00000000..e128a1c7 --- /dev/null +++ b/src/parquet/row_group_reader_contents.h @@ -0,0 +1,47 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace gdf { +namespace parquet { +namespace internal { + +class RowGroupReaderContents : public ::parquet::RowGroupReader::Contents { +public: + RowGroupReaderContents(::parquet::RandomAccessSource * source, + ::parquet::FileMetaData * file_metadata, + int row_group_number, + const ::parquet::ReaderProperties &props); + + const ::parquet::RowGroupMetaData *metadata() const final; + const ::parquet::ReaderProperties *properties() const final; + virtual std::unique_ptr<::parquet::PageReader> + GetColumnPageReader(int i) final; + +private: + ::parquet::RandomAccessSource * source_; + ::parquet::FileMetaData * file_metadata_; + std::unique_ptr<::parquet::RowGroupMetaData> row_group_metadata_; + ::parquet::ReaderProperties properties_; + + const std::int64_t kMaxDictHeaderSize = 100; +}; + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index d07fcbc6..30a46de5 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -43,10 +43,11 @@ add_subdirectory(datetime) add_subdirectory(hashing) add_subdirectory(join) add_subdirectory(sqls) -add_subdirectory(hash_map) -add_subdirectory(groupby) +add_subdirectory(parquet) add_subdirectory(unaryops) add_subdirectory(filterops_numeric) +add_subdirectory(hash_map) +add_subdirectory(groupby) add_subdirectory(quantiles) add_subdirectory(validops) diff --git a/src/tests/filterops_numeric/helper/utils.cuh b/src/tests/filterops_numeric/helper/utils.cuh index 7348f4da..3ae86441 100644 --- a/src/tests/filterops_numeric/helper/utils.cuh +++ b/src/tests/filterops_numeric/helper/utils.cuh @@ -3,6 +3,8 @@ #define GDF_TEST_UTILS #include +#include + #include #include #include @@ -13,7 +15,16 @@ #include #include #include -#include "gdf/gdf.h" + +#ifndef EXPECT_TRUE +#define EXPECT_TRUE (expr) + assert(expr); +#endif + +#ifndef EXPECT_EQ +#define EXPECT_EQ (lhs, rhs) + assert((lsh) == (rhs)); +#endif template inline gdf_dtype gdf_enum_type_for() diff --git a/src/tests/helper/utils.cu b/src/tests/helper/utils.cu new file mode 100644 index 00000000..e05d4006 --- /dev/null +++ b/src/tests/helper/utils.cu @@ -0,0 +1,87 @@ + +#include +#include +#include +#include +#include +#include "utils.cuh" + + +gdf_valid_type * get_gdf_valid_from_device(gdf_column* column) { + gdf_valid_type * host_valid_out; + size_t n_bytes = get_number_of_bytes_for_valid(column->size); + host_valid_out = new gdf_valid_type[n_bytes]; + cudaMemcpy(host_valid_out,column->valid, n_bytes, cudaMemcpyDeviceToHost); + return host_valid_out; +} + +std::string gdf_valid_to_str(gdf_valid_type *valid, size_t column_size) +{ + size_t n_bytes = get_number_of_bytes_for_valid(column_size); + std::string response; + for (size_t i = 0; i < n_bytes; i++) + { + size_t length = n_bytes != i + 1 ? 
GDF_VALID_BITSIZE : column_size - GDF_VALID_BITSIZE * (n_bytes - 1);
+        auto result = chartobin(valid[i], length);
+        response += std::string(result);
+    }
+    return response;
+}
+
+gdf_valid_type* gen_gdf_valid(size_t column_size, size_t init_value)
+{
+    gdf_valid_type *valid = nullptr;
+    if (column_size == 0)
+    {
+        valid = new gdf_valid_type[1];
+    }
+    else
+    {
+        size_t n_bytes = get_number_of_bytes_for_valid(column_size);
+        valid = new gdf_valid_type[n_bytes];
+        size_t i;
+        for (i = 0; i < n_bytes - 1; ++i)
+        {
+            valid[i] = (init_value % 256);
+        }
+        size_t length = column_size - GDF_VALID_BITSIZE * (n_bytes - 1);
+        valid[i] = 1 << (length - 1);
+    }
+    return valid;
+}
+
+
+void delete_gdf_column(gdf_column * column){
+    cudaFree(column->data);
+    cudaFree(column->valid);
+}
+
+gdf_size_type count_zero_bits(gdf_valid_type *valid, size_t column_size)
+{
+    size_t numbits = 0;
+    auto bin = gdf_valid_to_str(valid, column_size);
+
+    for (size_t i = 0; i < bin.length(); i++) {
+        if (bin[i] == '0')
+            numbits++;
+    }
+    return numbits;
+}
+
+std::string chartobin(gdf_valid_type c, int size/* = 8*/)
+{
+    std::string bin;
+    bin.resize(size);
+    bin[0] = 0;
+    int i;
+    for (i = 0; i < size; i++)
+    {
+        bin[i] = (c % 2) + '0';
+        c /= 2;
+    }
+    return bin;
+}
+
+auto print_binary(gdf_valid_type n, int size) -> void {
+    std::cout << chartobin(n) << "\t sz: " << size << "\tbinary: " << chartobin(n, size) << std::endl;
+}
diff --git a/src/tests/helper/utils.cuh b/src/tests/helper/utils.cuh
new file mode 100644
index 00000000..9c9d95f3
--- /dev/null
+++ b/src/tests/helper/utils.cuh
@@ -0,0 +1,336 @@
+
+#ifndef GDF_TEST_UTILS
+#define GDF_TEST_UTILS
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+
+template <typename T>
+inline gdf_dtype gdf_enum_type_for()
+{
+    return GDF_invalid;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int8_t>()
+{
+    return GDF_INT8;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int16_t>()
+{
+    return GDF_INT16;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int32_t>()
+{
+    return GDF_INT32;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int64_t>()
+{
+    return GDF_INT64;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<float>()
+{
+    return GDF_FLOAT32;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<double>()
+{
+    return GDF_FLOAT64;
+}
+
+inline auto get_number_of_bytes_for_valid(size_t column_size) -> size_t {
+    return sizeof(gdf_valid_type) * (column_size + GDF_VALID_BITSIZE - 1) / GDF_VALID_BITSIZE;
+}
+
+
+inline gdf_error gdf_column_view_init(gdf_column *column, void *data, gdf_valid_type *valid,
+                                      gdf_size_type size, gdf_dtype dtype, gdf_size_type null_count) {
+    column->data = data;
+    column->valid = valid;
+    column->size = size;
+    column->dtype = dtype;
+    column->null_count = null_count;
+    return GDF_SUCCESS;
+}
+
+
+auto print_binary(gdf_valid_type n, int size = 8) -> void;
+
+auto chartobin(gdf_valid_type n, int size = 8) -> std::string;
+
+gdf_size_type count_zero_bits(gdf_valid_type *valid, size_t column_size);
+
+auto delete_gdf_column(gdf_column * column) -> void;
+
+auto gen_gdf_valid(size_t column_size, size_t init_value) -> gdf_valid_type *;
+
+gdf_valid_type * get_gdf_valid_from_device(gdf_column* column);
+
+std::string gdf_valid_to_str(gdf_valid_type *valid, size_t column_size);
+
+template <typename RawType, typename PointerType>
+auto init_device_vector(gdf_size_type num_elements) -> std::tuple<RawType *, thrust::device_ptr<PointerType>>
+{
+    RawType *device_pointer;
+    cudaError_t cuda_error = cudaMalloc((void **)&device_pointer, sizeof(PointerType) * num_elements);
+    assert(cuda_error == cudaError::cudaSuccess);
+ thrust::device_ptr device_wrapper = thrust::device_pointer_cast((PointerType *)device_pointer); + return std::make_tuple(device_pointer, device_wrapper); +} + + +template +ValueType* get_gdf_data_from_device(gdf_column* column) { + ValueType* host_out = new ValueType[column->size]; + cudaMemcpy(host_out, column->data, sizeof(ValueType) * column->size, cudaMemcpyDeviceToHost); + return host_out; +} + +template +std::string gdf_data_to_str(void *data, size_t column_size) +{ + std::string response; + for (size_t i = 0; i < column_size; i++) + { + auto result = std::to_string(*((ValueType*)(data) + i)); + response += std::string(result); + } + return response; +} + + +template +gdf_column convert_to_device_gdf_column (gdf_column *column) { + size_t column_size = column->size; + char *raw_pointer; + thrust::device_ptr device_pointer; + std::tie(raw_pointer, device_pointer) = init_device_vector(column_size); + + void* host_out = column->data; + cudaMemcpy(raw_pointer, host_out, sizeof(ValueType) * column->size, cudaMemcpyHostToDevice); + + gdf_valid_type *host_valid = column->valid; + size_t n_bytes = get_number_of_bytes_for_valid(column_size); + + gdf_valid_type *valid_value_pointer; + cudaMalloc((void **)&valid_value_pointer, n_bytes); + cudaMemcpy(valid_value_pointer, host_valid, n_bytes, cudaMemcpyHostToDevice); + + gdf_column output; + gdf_column_view_init(&output, (void *)raw_pointer, valid_value_pointer, column_size, column->dtype, column->null_count); + return output; +} + +template +gdf_column convert_to_host_gdf_column (gdf_column *column) { + auto host_out = get_gdf_data_from_device(column); + auto host_valid_out = get_gdf_valid_from_device(column); + + auto output = *column; + output.data = host_out; + output.valid = host_valid_out; + return output; +} + + +template +auto print_column(gdf_column * column) -> void { + auto host_out = get_gdf_data_from_device(column); + auto bitmap = get_gdf_valid_from_device(column); + std::cout<<"Printing Column\t null_count:" << column->null_count << "\t type " << column->dtype << std::endl; + size_t n_bytes = sizeof(int8_t) * (column->size + GDF_VALID_BITSIZE - 1) / GDF_VALID_BITSIZE; + for(std::size_t i = 0; i < column->size; i++) { + size_t col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : column->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + + ValueType value = static_cast(host_out)[i]; + + if ( bitmap[i / 8] & (1 << (i % 8)) ) { + std::cout << "host_out[" << i << "] = " << value <<"\t\tvalid="<< 1 <(column_size); + // std::cout << "1. gen_gdb_column\n"; + + using thrust::detail::make_normal_iterator; + thrust::fill(make_normal_iterator(device_pointer), make_normal_iterator(device_pointer + column_size), init_value); + //std::cout << "2. gen_gdb_column\n"; + + gdf_valid_type *host_valid = gen_gdf_valid(column_size, init_value); + size_t n_bytes = get_number_of_bytes_for_valid(column_size); + + gdf_valid_type *valid_value_pointer; + cudaMalloc((void **)&valid_value_pointer, n_bytes); + cudaMemcpy(valid_value_pointer, host_valid, n_bytes, cudaMemcpyHostToDevice); + // std::cout << "3. gen_gdb_column\n"; + + gdf_column output; + auto zero_bits = output.null_count = count_zero_bits(host_valid, column_size); + + gdf_column_view_init(&output, + (void *)raw_pointer, valid_value_pointer, + column_size, + gdf_enum_type_value, + zero_bits); + //std::cout << "4. 
gen_gdb_column\n"; + + delete []host_valid; + return output; +} + +template +void check_column_for_stencil_operation(gdf_column *column, gdf_column *stencil, gdf_column *output_op) { + gdf_column host_column = convert_to_host_gdf_column(column); + gdf_column host_stencil = convert_to_host_gdf_column(stencil); + gdf_column host_output_op = convert_to_host_gdf_column(output_op); + + assert(host_column.size == host_stencil.size); + //EXPECT_EQ(host_column.dtype == host_output_op.dtype); // it must have the same type + + + int n_bytes = sizeof(int8_t) * (column->size + GDF_VALID_BITSIZE - 1) / GDF_VALID_BITSIZE; + std::vector indexes; + for(size_t i = 0; i < host_stencil.size; i++) { + int col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : column->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + bool valid = ((host_stencil.valid[col_position] >> bit_offset ) & 1) != 0; + if ( (int)( ((int8_t *)host_stencil.data)[i] ) == 1 && valid ) { + indexes.push_back(i); + } + } + + for(size_t i = 0; i < indexes.size(); i++) + { + int index = indexes[i]; + LeftValueType value = ((LeftValueType *)(host_column.data))[index]; + std::cout << "filtered values: " << index << "** " << "\t value: " << (int)value << std::endl; + assert( ((RightValueType*)host_output_op.data)[i] == value); + + int col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : output_op->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + bool valid = ((host_output_op.valid[col_position] >> bit_offset ) & 1) != 0; + assert(valid == true); + } +} + +template +void check_column_for_comparison_operation(gdf_column *lhs, gdf_column *rhs, gdf_column *output, gdf_comparison_operator gdf_operator) +{ + { + auto lhs_valid = get_gdf_valid_from_device(lhs); + auto rhs_valid = get_gdf_valid_from_device(rhs); + auto output_valid = get_gdf_valid_from_device(output); + + size_t n_bytes = get_number_of_bytes_for_valid(output->size); + + assert(lhs->size == rhs->size); + + for(int i = 0; i < output->size; i++) { + int col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : output->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + + assert( ((lhs_valid[col_position] >> bit_offset ) & 1) & ((rhs_valid[col_position] >> bit_offset ) & 1) == + ((output_valid[col_position] >> bit_offset ) & 1) ); + } + + delete[] lhs_valid; + delete[] rhs_valid; + delete[] output_valid; + } + + { + auto lhs_data = get_gdf_data_from_device(lhs); + auto rhs_data = get_gdf_data_from_device(rhs); + auto output_data = get_gdf_data_from_device(output); + + assert(lhs->size == rhs->size); + for(size_t i = 0; i < lhs->size; i++) + { + assert(lhs_data[i] == rhs_data[i] ? 
1 : 0 == output_data[i]); + } + + delete[] lhs_data; + delete[] rhs_data; + delete[] output_data; + } + +} + +template +void check_column_for_concat_operation(gdf_column *lhs, gdf_column *rhs, gdf_column *output) +{ + { + auto lhs_valid = get_gdf_valid_from_device(lhs); + auto rhs_valid = get_gdf_valid_from_device(rhs); + auto output_valid = get_gdf_valid_from_device(output); + + auto computed = gdf_valid_to_str(output_valid, output->size); + auto expected = gdf_valid_to_str(lhs_valid, lhs->size) + gdf_valid_to_str(rhs_valid, rhs->size); + + //std::cout << "computed: " << computed << std::endl; + //std::cout << "expected: " << expected << std::endl; + + delete[] lhs_valid; + delete[] rhs_valid; + delete[] output_valid; + assert(computed == expected); + } + + { + auto lhs_data = get_gdf_data_from_device(lhs); + auto rhs_data = get_gdf_data_from_device(rhs); + auto output_data = get_gdf_data_from_device(output); + + auto computed = gdf_data_to_str(output_data, output->size); + auto expected = gdf_data_to_str(lhs_data, lhs->size) + gdf_data_to_str(rhs_data, rhs->size); + delete[] lhs_data; + delete[] rhs_data; + delete[] output_data; + assert(computed == expected); + } + +} + + +#endif // GDF_TEST_UTILS diff --git a/src/tests/parquet/CMakeLists.txt b/src/tests/parquet/CMakeLists.txt new file mode 100644 index 00000000..37ad2882 --- /dev/null +++ b/src/tests/parquet/CMakeLists.txt @@ -0,0 +1,32 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + + + + +function(GDF_ADD_PARQUET_TEST TARGET) + message(STATUS "${TARGET} will link against: gdf parquet") + list(REMOVE_AT ARGV 0) + cuda_add_executable(${TARGET} ${ARGV}) + target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/src/parquet) + target_link_libraries(${TARGET} gmock_main gmock GTest::GTest gdf-parquet) + get_property(ARGN TARGET ${TARGET} PROPERTY SOURCES) + # gtest_add_tests(${TARGET} "" ${ARGN}) +endfunction() + +add_subdirectory(file_reader) +add_subdirectory(gdf_column) diff --git a/src/tests/parquet/file_reader/CMakeLists.txt b/src/tests/parquet/file_reader/CMakeLists.txt new file mode 100644 index 00000000..58c64e7e --- /dev/null +++ b/src/tests/parquet/file_reader/CMakeLists.txt @@ -0,0 +1,33 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +find_package(Boost REQUIRED COMPONENTS filesystem) + +set(PARQUET_FILE_PATH + ${CMAKE_SOURCE_DIR}/src/tests/parquet/file_reader/reader-test.parquet) + +GDF_ADD_PARQUET_TEST(file_reader-test + #file_reader-test.cpp + #single_column_file-test.cpp + api-test.cu + null-test.cu + ../../helper/utils.cuh + ../../helper/utils.cu) + +target_compile_definitions(file_reader-test + PUBLIC -DPARQUET_FILE_PATH="${PARQUET_FILE_PATH}") +target_link_libraries(file_reader-test Boost::filesystem) diff --git a/src/tests/parquet/file_reader/api-test.cu b/src/tests/parquet/file_reader/api-test.cu new file mode 100644 index 00000000..7318cc6a --- /dev/null +++ b/src/tests/parquet/file_reader/api-test.cu @@ -0,0 +1,445 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include "../../helper/utils.cuh" + +class ParquetReaderAPITest : public testing::Test { +protected: + ParquetReaderAPITest() + : filename(boost::filesystem::unique_path().native()) {} + + std::int32_t + genInt32(int i) { + if (i >= 100 && i < 150) { + return 10000; + } else if (i >= 200 && i < 300) { + return 20000; + } else if (i >= 310 && i < 350) { + return 30000; + } else if (i >= 450 && i < 550) { + return 40000; + } else if (i >= 800 && i < 950) { + return 50000; + } else { + return i * 100; + } + } + + std::int64_t + genInt64(int i) { + if (i >= 100 && i < 150) { + return 10000; + } else if (i >= 200 && i < 300) { + return 20000; + } else if (i >= 310 && i < 350) { + return 30000; + } else if (i >= 450 && i < 550) { + return 40000; + } else if (i >= 800 && i < 950) { + return 50000; + } else { + return i * 100000; + } + } + + void + SetUp() final { + static constexpr std::size_t kGroups = 3; + static constexpr std::size_t kRowsPerGroup = 499; + try { + + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = + CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + std::int16_t repetition_level = 0; + + for (std::size_t i = 0; i < kGroups; i++) { + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::BoolWriter *bool_writer = + static_cast<::parquet::BoolWriter *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 1 : 0; + bool bool_value = true; + bool_writer->WriteBatch( + 1, &definition_level, &repetition_level, &bool_value); + } + + ::parquet::Int32Writer *int32_writer = + static_cast<::parquet::Int32Writer *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 1 : 0; + std::int32_t int32_value = genInt32(ind); + int32_writer->WriteBatch( + 1, &definition_level, &repetition_level, &int32_value); + } + + ::parquet::Int64Writer *int64_writer = + static_cast<::parquet::Int64Writer *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 1 : 0; + std::int64_t int64_value = genInt64(ind); + int64_writer->WriteBatch( + 1, &definition_level, &repetition_level, &int64_value); + } + + ::parquet::DoubleWriter *double_writer = + static_cast<::parquet::DoubleWriter *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 
1 : 0; + double double_value = (double)ind; + double_writer->WriteBatch( + 1, &definition_level, &repetition_level, &double_value); + } + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { + FAIL() << "Generate file" << e.what(); + } + } + + std ::shared_ptr<::parquet::schema::GroupNode> + CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{ + ::parquet::schema::PrimitiveNode::Make( + "boolean_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::BOOLEAN, + ::parquet::LogicalType::NONE), + ::parquet::schema::PrimitiveNode::Make( + "int32_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::INT32, + ::parquet::LogicalType::NONE), + ::parquet::schema::PrimitiveNode::Make( + "int64_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::INT64, + ::parquet::LogicalType::NONE), + ::parquet::schema::PrimitiveNode::Make( + "double_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::DOUBLE, + ::parquet::LogicalType::NONE), + })); + } + + void + TearDown() final { + if (std::remove(filename.c_str())) { FAIL() << "Remove file"; } + } + + void + checkNulls(/*const */ gdf_column &column) { + + const std::size_t valid_size = + arrow::BitUtil::BytesForBits(column.size); + const std::size_t valid_last = valid_size - 1; + + int fails = 0; + for (std::size_t i = 0; i < valid_last; i++) { + + if (i % 3 == 0){ + std::uint8_t valid = column.valid[i]; + std::uint8_t expected = 0b10110110; + EXPECT_EQ(expected, valid); + if (expected != valid){ + std::cout<<"fail at checkNulls i: "< 5) + break; + } + } else if (i % 3 == 1){ + std::uint8_t valid = column.valid[i]; + std::uint8_t expected = 0b01101101; + EXPECT_EQ(expected, valid); + if (expected != valid){ + std::cout<<"fail at checkNulls i: "< 5) + break; + } + } else { + std::uint8_t valid = column.valid[i]; + std::uint8_t expected = 0b11011011; + EXPECT_EQ(expected, valid); + if (expected != valid){ + std::cout<<"fail at checkNulls i: "< 5) + break; + } + } + + + } +// EXPECT_EQ(0b00101101, 0b00101101 & column.valid[valid_last]); + } + + void + checkBoolean(/*const */ gdf_column &column) { + + gdf_column boolean_column = + convert_to_host_gdf_column<::parquet::BooleanType::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < boolean_column.size; i++) { + if (i % 3 > 0) { + bool expected = true; + bool value = static_cast(boolean_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkBoolean row: "< 5){ + break; + } + } + } + } + checkNulls(boolean_column); + } + + void + checkInt32(/*const */ gdf_column &column) { + + gdf_column int32_column = + convert_to_host_gdf_column<::parquet::Int32Type::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < int32_column.size; i++) { + if (i % 3 > 0) { + std::int32_t expected = genInt32(i); + std::int32_t value = + static_cast(int32_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkInt32 row: "< 5){ + break; + } + } + } + } + + checkNulls(int32_column); + } + + void + checkInt64(/*const */ gdf_column &column) { + gdf_column int64_column = + convert_to_host_gdf_column<::parquet::Int64Type::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < int64_column.size; i++) { + if (i % 3 > 0) { + std::int64_t expected = genInt64(i); + std::int64_t value = + 
static_cast(int64_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkInt64 row: "< 5){ + break; + } + } + } + } + + checkNulls(int64_column); + } + + void + checkDouble(/*const */ gdf_column &column) { + gdf_column double_column = + convert_to_host_gdf_column<::parquet::DoubleType::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < double_column.size; i++) { + if (i % 3 > 0) { + double expected = static_cast(i); + double value = static_cast(double_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkDouble row: "< 5){ + break; + } + } + } + } + + checkNulls(double_column); + } + + const std::string filename; + + gdf_column *columns = nullptr; + std::size_t columns_length = 0; +}; + +TEST_F(ParquetReaderAPITest, ReadAll) { + + gdf_error error_code = gdf::parquet::read_parquet( + filename.c_str(), nullptr, &columns, &columns_length); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns_length); + + EXPECT_EQ(columns[0].size, columns[1].size); + EXPECT_EQ(columns[1].size, columns[2].size); + + checkBoolean(columns[0]); + checkInt32(columns[1]); + checkInt64(columns[2]); + checkDouble(columns[3]); +} + +TEST_F(ParquetReaderAPITest, ReadSomeColumns) { + const char *const column_names[] = {"double_field", "int64_field", nullptr}; + + gdf_error error_code = gdf::parquet::read_parquet( + filename.c_str(), column_names, &columns, &columns_length); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(2U, columns_length); + + checkDouble(columns[0]); + checkInt64(columns[1]); +} + +TEST_F(ParquetReaderAPITest, ByIdsInOrder) { + const std::vector row_group_indices = {0, 1}; + const std::vector column_indices = {0, 1, 2, 3}; + + std::vector columns; + + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + filename, row_group_indices, column_indices, columns); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns.size()); + + checkBoolean(*columns[0]); + checkInt32(*columns[1]); + checkInt64(*columns[2]); + checkDouble(*columns[3]); +} + +TEST_F(ParquetReaderAPITest, ByIdsOutOfOrder) { + const std::vector row_group_indices = {0, 1}; + const std::vector column_indices = {1, 3, 2, 0}; + + std::vector columns; + + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + filename, row_group_indices, column_indices, columns); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns.size()); + + checkBoolean(*columns[3]); + checkInt32(*columns[0]); + checkInt64(*columns[2]); + checkDouble(*columns[1]); +} + +TEST_F(ParquetReaderAPITest, ByIdsInFromInterface) { + const std::vector row_group_indices = {0, 1}; + const std::vector column_indices = {0, 1, 2, 3}; + + std::vector columns; + + std::shared_ptr<::arrow::io::ReadableFile> file; + const ::parquet::ReaderProperties properties = ::parquet::default_reader_properties(); + ::arrow::io::ReadableFile::Open(filename, properties.memory_pool(), &file); + + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + file, row_group_indices, column_indices, columns); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns.size()); + + checkBoolean(*columns[0]); + checkInt32(*columns[1]); + checkInt64(*columns[2]); + checkDouble(*columns[3]); +} diff --git a/src/tests/parquet/file_reader/file_reader-test.cpp b/src/tests/parquet/file_reader/file_reader-test.cpp new file mode 100644 index 00000000..177ccdd7 --- /dev/null +++ b/src/tests/parquet/file_reader/file_reader-test.cpp @@ -0,0 +1,121 @@ 
+/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#ifndef PARQUET_FILE_PATH +#error PARQUET_FILE_PATH must be defined for precompiling +#define PARQUET_FILE_PATH "/" +#endif + +inline static void +checkMetadata(const std::shared_ptr &metadata) { + EXPECT_EQ(1, metadata->num_row_groups()); + EXPECT_EQ(3, metadata->num_columns()); +} + +inline static void +checkRowGroup(const std::unique_ptr &reader) { + const std::shared_ptr<::parquet::RowGroupReader> row_group = + reader->RowGroup(0); + + std::size_t i; + std::int16_t definition_level; + std::int16_t repetition_level; + std::uint8_t valid_bits; + std::int64_t levels_read; + std::int64_t values_read = 0; + std::int64_t nulls_count; + + std::shared_ptr column; + + column = row_group->Column(0); + gdf::parquet::BoolReader *bool_reader = + static_cast(column.get()); + i = 0; +// while (bool_reader->HasNext()) { +// bool value; +// bool_reader->ReadBatchSpaced(1, +// &definition_level, +// &repetition_level, +// &value, +// &valid_bits, +// 0, +// &levels_read, +// &values_read, +// &nulls_count); +// bool expected = (i % 2) == 0; +// EXPECT_EQ(expected, value); +// i++; +// } + + column = row_group->Column(1); + gdf::parquet::Int64Reader *int64_reader = + static_cast(column.get()); + i = 0; +// while (int64_reader->HasNext()) { +// std::int64_t value; +// int64_reader->ReadBatchSpaced(1, +// &definition_level, +// &repetition_level, +// &value, +// &valid_bits, +// 0, +// &levels_read, +// &values_read, +// &nulls_count); +// std::int64_t expected = static_cast(i) * 1000000000000; +// EXPECT_EQ(expected, value); +// i++; +// } + + column = row_group->Column(2); + gdf::parquet::DoubleReader *double_reader = + static_cast(column.get()); + i = 0; +// while (double_reader->HasNext()) { +// double value; +// double_reader->ReadBatchSpaced(1, +// &definition_level, +// &repetition_level, +// &value, +// &valid_bits, +// 0, +// &levels_read, +// &values_read, +// &nulls_count); +// double expected = i * 0.001; +// EXPECT_EQ(expected, value); +// i++; +// } +} + +TEST(FileReaderTest, Read) { + std::unique_ptr reader = + gdf::parquet::FileReader::OpenFile(PARQUET_FILE_PATH); + + checkMetadata(reader->metadata()); + checkRowGroup(reader); +} diff --git a/src/tests/parquet/file_reader/null-test.cu b/src/tests/parquet/file_reader/null-test.cu new file mode 100644 index 00000000..6f1a1c1b --- /dev/null +++ b/src/tests/parquet/file_reader/null-test.cu @@ -0,0 +1,200 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "../../../parquet/column_reader.h" +#include "../../../parquet/file_reader.h" + +#include + +#include "../../helper/utils.cuh" + +template +class NullTest : public ::testing::Test { +protected: + using TYPE = typename DataType::c_type; + + NullTest(); + + void GenerateFile(); + TYPE GenerateValue(std::size_t i); + + virtual void SetUp() override; + virtual void TearDown() override; + + static constexpr std::size_t kGroups = 2; + static constexpr std::size_t kRowsPerGroup = 50; + + const std::string filename; + +private: + std::shared_ptr<::parquet::schema::GroupNode> CreateSchema(); +}; + +using Types = ::testing::Types<::parquet::Int64Type>; +TYPED_TEST_CASE(NullTest, Types); + +template +void +NullTest::SetUp() { + GenerateFile(); +} + +template +void +NullTest::TearDown() { + if (std::remove(filename.c_str())) { FAIL() << "Remove file"; } +} + +template +NullTest::NullTest() + : filename(boost::filesystem::unique_path().native()) {} + +template +void +NullTest::GenerateFile() { + try { + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + for (std::size_t i = 0; i < kGroups; i++) { + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::TypedColumnWriter *writer = + static_cast<::parquet::TypedColumnWriter *>( + row_group_writer->NextColumn()); + std::int16_t repetition_level = 0; + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + TYPE value = GenerateValue(i * kRowsPerGroup + j); + std::int16_t definition_level = j % 2; + writer->WriteBatch( + 1, &definition_level, &repetition_level, &value); + } + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { FAIL() << "Generate file" << e.what(); } +} + +template +std::shared_ptr<::parquet::schema::GroupNode> +NullTest::CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{::parquet::schema::PrimitiveNode::Make( + "field", + ::parquet::Repetition::OPTIONAL, + DataType::type_num, + ::parquet::LogicalType::NONE)})); +} + +template +typename NullTest::TYPE +NullTest::GenerateValue(std::size_t i) { + return static_cast(i) * 10; +} + +TYPED_TEST(NullTest, ReadAll) { + std::unique_ptr reader = + gdf::parquet::FileReader::OpenFile(this->filename); + + std::shared_ptr> column_reader = + std::static_pointer_cast>( + reader->RowGroup(0)->Column(0)); + + ASSERT_TRUE(column_reader->HasNext()); + + 
using value_type = typename TypeParam::c_type; + + const std::size_t rowsPerGroup = this->kRowsPerGroup; + const std::size_t groups = this->kGroups; + + gdf_column column{ + .data = nullptr, + .valid = nullptr, + .size = 0, + .dtype = GDF_INT64, + .null_count = 0, + .dtype_info = {}, + }; + + std::size_t valid_size = + get_number_of_bytes_for_valid(rowsPerGroup * groups); + + cudaMalloc(&column.data, rowsPerGroup * groups * sizeof(value_type)); + cudaMalloc(&column.valid, valid_size); + + const std::size_t total_read = column_reader->ToGdfColumn(column); + + column_reader = + std::static_pointer_cast>( + reader->RowGroup(1)->Column(0)); + + ASSERT_TRUE(column_reader->HasNext()); + const std::size_t total_read2 = column_reader->ToGdfColumn(column, 50); + + column.size = static_cast(rowsPerGroup * groups); + + EXPECT_EQ(rowsPerGroup, total_read); + + gdf_column host_column = convert_to_host_gdf_column(&column); + + for (std::size_t i = 0; i < groups * rowsPerGroup; i++) { + value_type expected = this->GenerateValue(i); + std::int64_t value = static_cast(host_column.data)[i]; + if (i % 2) { EXPECT_EQ(expected, value); } + } + + const std::size_t valid_last = valid_size - 1; + for (std::size_t i = 0; i < valid_last; i++) { + std::uint8_t valid = host_column.valid[i]; + EXPECT_EQ(0b10101010, valid); + } + EXPECT_EQ(0b00001010, 0b00001010 & host_column.valid[valid_last]); + + delete_gdf_column(&column); +} diff --git a/src/tests/parquet/file_reader/reader-test.parquet b/src/tests/parquet/file_reader/reader-test.parquet new file mode 100644 index 00000000..f00b6a8e Binary files /dev/null and b/src/tests/parquet/file_reader/reader-test.parquet differ diff --git a/src/tests/parquet/file_reader/single_column_file-test.cpp b/src/tests/parquet/file_reader/single_column_file-test.cpp new file mode 100644 index 00000000..11c2b727 --- /dev/null +++ b/src/tests/parquet/file_reader/single_column_file-test.cpp @@ -0,0 +1,180 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include + +template +class SingleColumnFileTest : public ::testing::Test { +protected: + using TYPE = typename DataType::c_type; + + SingleColumnFileTest(); + + void GenerateFile(); + TYPE GenerateValue(std::size_t i); + + virtual void SetUp() override; + virtual void TearDown() override; + + static constexpr std::size_t kRowsPerGroup = 100; + + const std::string filename; + +private: + std::shared_ptr<::parquet::schema::GroupNode> CreateSchema(); +}; + +using Types = ::testing::Types<::parquet::BooleanType, + ::parquet::Int32Type, + ::parquet::Int64Type, + ::parquet::FloatType, + ::parquet::DoubleType>; +TYPED_TEST_CASE(SingleColumnFileTest, Types); + +template +void +SingleColumnFileTest::SetUp() { + GenerateFile(); +} + +template +void +SingleColumnFileTest::TearDown() { + if (std::remove(filename.c_str())) { FAIL() << "Remove file"; } +} + +template +SingleColumnFileTest::SingleColumnFileTest() + : filename(boost::filesystem::unique_path().native()) {} + +template +void +SingleColumnFileTest::GenerateFile() { + try { + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::TypedColumnWriter *writer = + static_cast<::parquet::TypedColumnWriter *>( + row_group_writer->NextColumn()); + std::int16_t repetition_level = 0; + for (std::size_t i = 0; i < kRowsPerGroup; i++) { + TYPE value = GenerateValue(i); + std::int16_t definition_level = i % 2 ? 
1 : 0; + writer->WriteBatch( + 1, &definition_level, &repetition_level, &value); + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { + FAIL() << "Generate file" << e.what(); + } +} + +template +std::shared_ptr<::parquet::schema::GroupNode> +SingleColumnFileTest::CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{::parquet::schema::PrimitiveNode::Make( + "field", + ::parquet::Repetition::OPTIONAL, + DataType::type_num, + ::parquet::LogicalType::NONE)})); +} + +template +typename SingleColumnFileTest::TYPE +SingleColumnFileTest::GenerateValue(std::size_t i) { + return static_cast(i) * 1000000000000; +} + +TYPED_TEST(SingleColumnFileTest, ReadAll) { + std::unique_ptr reader = + gdf::parquet::FileReader::OpenFile(this->filename); + + std::shared_ptr> column_reader = + std::static_pointer_cast>( + reader->RowGroup(0)->Column(0)); + + ASSERT_TRUE(column_reader->HasNext()); + + using value_type = typename TypeParam::c_type; + + const std::size_t rowsPerGroup = this->kRowsPerGroup; + + gdf_column column{ + .data = new std::uint8_t[rowsPerGroup * sizeof(value_type)], + .valid = new std::uint8_t[rowsPerGroup], + .size = 0, + .dtype = GDF_invalid, + .null_count = 0, + .dtype_info = {}, + }; + std::int16_t definition_levels[rowsPerGroup]; + std::int16_t repetition_levels[rowsPerGroup]; + + const std::size_t total_read = + column_reader->ToGdfColumn(definition_levels, repetition_levels, column); + + EXPECT_EQ(rowsPerGroup, total_read); + + for (std::size_t i = 0; i < rowsPerGroup; i++) { + value_type expected = this->GenerateValue(i); + std::int64_t value = static_cast(column.data)[i]; + if (i % 2) { EXPECT_EQ(expected, value); } + } + + delete[] static_cast(column.data); + delete[] column.valid; +} diff --git a/src/tests/parquet/gdf_column/CMakeLists.txt b/src/tests/parquet/gdf_column/CMakeLists.txt new file mode 100644 index 00000000..1164fc4c --- /dev/null +++ b/src/tests/parquet/gdf_column/CMakeLists.txt @@ -0,0 +1,36 @@ + + +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + + + +find_package(Boost REQUIRED COMPONENTS filesystem) + +set(PARQUET_FILE_FOR_DECODING_PATH + ${CMAKE_SOURCE_DIR}/src/tests/parquet/gdf_column/reader-test.parquet) + +add_definitions(-DPARQUET_FILE_FOR_DECODING_PATH="${PARQUET_FILE_FOR_DECODING_PATH}") + +GDF_ADD_PARQUET_TEST(gdf_column-test + gdf_column-test.cu + ../../helper/utils.cuh + ../../helper/utils.cu + ) + +target_link_libraries(gdf_column-test Boost::filesystem) + diff --git a/src/tests/parquet/gdf_column/gdf_column-test.cu b/src/tests/parquet/gdf_column/gdf_column-test.cu new file mode 100644 index 00000000..b3beb60e --- /dev/null +++ b/src/tests/parquet/gdf_column/gdf_column-test.cu @@ -0,0 +1,213 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + #include + + #include + + #include + + #include + #include + + + #include + #include + #include + #include + #include + + #include + + #include + #include + + #include "../../../parquet/column_reader.h" + #include "../../../parquet/file_reader.h" + + #include "../../helper/utils.cuh" + + #ifndef PARQUET_FILE_FOR_DECODING_PATH + #error PARQUET_FILE_FOR_DECODING_PATH must be defined for precompiling + #define PARQUET_FILE_FOR_DECODING_PATH "/" + #endif + + template + class SingleColumnToGdfTest : public ::testing::Test { + protected: + using TYPE = typename DataType::c_type; + + SingleColumnToGdfTest(); + + void GenerateFile(); + + inline TYPE GenerateValue(size_t i) { + if (sizeof (TYPE) == 1 ) { + return i % 2; + } + return static_cast(i) * 10; + } + + virtual void SetUp() override; + + virtual void TearDown() override; + + static constexpr size_t kRowsPerGroup = 50; + + const std::string filename; + + private: + std::shared_ptr<::parquet::schema::GroupNode> CreateSchema(); + }; + + using Types = ::testing::Types<::parquet::BooleanType, + ::parquet::Int32Type>; + TYPED_TEST_CASE(SingleColumnToGdfTest, Types); + + template + void SingleColumnToGdfTest::SetUp() { + GenerateFile(); + } + + template + void SingleColumnToGdfTest::TearDown() { + if ( std::remove(filename.c_str())) { + FAIL() << "Remove file"; + } + } + + template + SingleColumnToGdfTest::SingleColumnToGdfTest() + : filename ( boost::filesystem::unique_path().native()) + { + } + + template + void SingleColumnToGdfTest::GenerateFile() { + try { + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + // Set ColumnDescriptor! 
= 3 + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::TypedColumnWriter *writer = + static_cast<::parquet::TypedColumnWriter *>( + row_group_writer->NextColumn()); + std::int16_t repetition_level = 0; + for (std::size_t i = 0; i < kRowsPerGroup; i++) { + TYPE value = GenerateValue(i); + std::int16_t definition_level = i % 2 ? 1 : 0; + writer->WriteBatch( + 1, &definition_level, &repetition_level, &value); + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { + FAIL() << "Generate file" << e.what(); + } + } + + + template + std::shared_ptr<::parquet::schema::GroupNode> + SingleColumnToGdfTest::CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{::parquet::schema::PrimitiveNode::Make( + "field", + ::parquet::Repetition::OPTIONAL, + DataType::type_num, + ::parquet::LogicalType::NONE)})); + } + + + + TYPED_TEST(SingleColumnToGdfTest, ReadAll) { + + using FileReader = gdf::parquet::FileReader; + using ColumnReader = gdf::parquet::ColumnReader; + auto reader = FileReader::OpenFile(this->filename); + auto row_group = reader->RowGroup(0); + auto abstract_column_reader = row_group->Column(0); + std::cout << "column_reader id : " << typeid(abstract_column_reader).name() << std::endl; + + auto column_reader = std::static_pointer_cast(abstract_column_reader); + + ASSERT_TRUE(column_reader->HasNext()); + + using value_type = typename TypeParam::c_type; + + const std::size_t rowsPerGroup = this->kRowsPerGroup; + + gdf_column column{ + .data = nullptr, + .valid = nullptr, + .size = rowsPerGroup, + .dtype = GDF_invalid, + .null_count = 0, + .dtype_info = {}, + }; + cudaMalloc(&column.data, rowsPerGroup * sizeof(value_type)); + + auto n_bytes = get_number_of_bytes_for_valid(this->kRowsPerGroup); + cudaMalloc((void **)&column.valid, n_bytes); + + // std::int16_t definition_levels[rowsPerGroup]; + // std::int16_t repetition_levels[rowsPerGroup]; + + const std::size_t total_read = + column_reader->ToGdfColumn(column); + + column.size = static_cast(rowsPerGroup); + // column.dtype = ParquetTraits::gdfDType; + + EXPECT_EQ(rowsPerGroup, total_read); // using ReadBatch + + print_column(&column); + + gdf_column host_column = convert_to_host_gdf_column(&column); + + for (std::size_t i = 0; i < rowsPerGroup; i++) { + if (i % 2) { + value_type expected = this->GenerateValue(i); + value_type value = static_cast(host_column.data)[i]; + EXPECT_EQ(expected, value); + } + } + + delete_gdf_column(&column); + } + diff --git a/src/tests/parquet/gdf_column/reader-test.parquet b/src/tests/parquet/gdf_column/reader-test.parquet new file mode 100644 index 00000000..f00b6a8e Binary files /dev/null and b/src/tests/parquet/gdf_column/reader-test.parquet differ
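
For reference, a minimal sketch of how the reader added by this patch is driven from application code, following the call pattern exercised in api-test.cu above. The public header path (gdf/parquet/api.h), the inclusion of gdf/gdf.h for gdf_column and gdf_error, and the cleanup step are assumptions; the read_parquet and read_parquet_by_ids signatures mirror the test code.

// usage-sketch.cpp -- illustrative only; header paths and cleanup are assumptions.
#include <cstddef>
#include <iostream>

#include <gdf/gdf.h>          // assumed: defines gdf_column, gdf_error, GDF_SUCCESS
#include <gdf/parquet/api.h>  // assumed public header exposing gdf::parquet::read_parquet

int main() {
    gdf_column *columns = nullptr;
    std::size_t columns_length = 0;

    // Read every row group and every column into device-resident gdf_columns,
    // as ParquetReaderAPITest.ReadAll does.
    gdf_error status = gdf::parquet::read_parquet(
        "reader-test.parquet", /*column_names=*/nullptr, &columns, &columns_length);
    if (status != GDF_SUCCESS) {
        std::cerr << "read_parquet failed\n";
        return 1;
    }

    std::cout << "columns read: " << columns_length << "\n";
    if (columns_length != 0) {
        std::cout << "rows per column: " << columns[0].size << "\n";
    }

    // A column subset is selected with a null-terminated list of names,
    // as in ParquetReaderAPITest.ReadSomeColumns:
    //   const char *const names[] = {"double_field", "int64_field", nullptr};
    //   gdf::parquet::read_parquet("reader-test.parquet", names, &columns, &columns_length);

    // Device buffers would be released here (e.g. with the delete_gdf_column
    // helper from the test utilities) -- omitted for brevity.
    return 0;
}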