diff --git a/.gitignore b/.gitignore index 770c680d..2f7017ff 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ python/libgdf_cffi/libgdf_cffi.py ## eclipse .project + +build2/ diff --git a/CMakeLists.txt b/CMakeLists.txt index d2cb7a84..39949076 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ #============================================================================= # Copyright 2018 BlazingDB, Inc. # Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright 2018 Cristhian Alberto Gonzales Castillo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,7 +26,7 @@ PROJECT(libgdf) -cmake_minimum_required(VERSION 2.8) # not sure about version required +cmake_minimum_required(VERSION 3.3) # not sure about version required set(CMAKE_CXX_STANDARD 11) message(STATUS "Using C++ standard: c++${CMAKE_CXX_STANDARD}") @@ -46,6 +47,7 @@ include(CTest) # Include custom modules (see cmake directory) include(ConfigureGoogleTest) include(ConfigureArrow) +include(ConfigureParquetCpp) find_package(CUDA) set_package_properties( @@ -83,12 +85,15 @@ else() message(FATAL_ERROR "Apache Arrow not found, please check your settings.") endif() +get_property(PARQUETCPP_INCLUDE_DIRS TARGET Apache::ParquetCpp PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + include_directories( "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cub" "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/moderngpu/src" "${CUDA_INCLUDE_DIRS}" "${ARROW_INCLUDEDIR}" + "${PARQUETCPP_INCLUDE_DIRS}" ) IF(CUDA_VERSION_MAJOR GREATER 7) @@ -119,6 +124,19 @@ if(HT_LEGACY_ALLOCATOR) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHT_LEGACY_ALLOCATOR) endif() +cuda_add_library(gdf-parquet + src/parquet/api.cpp + src/parquet/column_reader.cu + src/parquet/file_reader.cpp + src/parquet/file_reader_contents.cpp + src/parquet/page_reader.cpp + src/parquet/row_group_reader_contents.cpp + src/parquet/decoder/cu_level_decoder.cu + src/arrow/cu_decoder.cu + src/arrow/util/pinned_allocator.cu +) + +target_link_libraries(gdf-parquet Apache::ParquetCpp) cuda_add_library(gdf SHARED src/binaryops.cu @@ -198,5 +216,10 @@ if(GTEST_FOUND) else() message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.") endif() + +if(GDF_BENCHMARK) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/bench) +endif() + # Print the project summary feature_summary(WHAT ALL INCLUDE_QUIET_PACKAGES FATAL_ON_MISSING_REQUIRED_PACKAGES) diff --git a/cmake/Modules/ConfigureArrow.cmake b/cmake/Modules/ConfigureArrow.cmake index 030e9986..e644d504 100644 --- a/cmake/Modules/ConfigureArrow.cmake +++ b/cmake/Modules/ConfigureArrow.cmake @@ -1,6 +1,7 @@ #============================================================================= # Copyright 2018 BlazingDB, Inc. # Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright 2018 Cristhian Alberto Gonzales Castillo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +16,7 @@ # limitations under the License. 
 #=============================================================================
 
-set(ARROW_DOWNLOAD_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/arrow-download/)
+set(ARROW_DOWNLOAD_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/arrow-download)
 
 # Download and unpack arrow at configure time
 configure_file(${CMAKE_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake ${ARROW_DOWNLOAD_BINARY_DIR}/CMakeLists.txt COPYONLY)
diff --git a/cmake/Modules/ConfigureParquetCpp.cmake b/cmake/Modules/ConfigureParquetCpp.cmake
new file mode 100644
index 00000000..c425bd55
--- /dev/null
+++ b/cmake/Modules/ConfigureParquetCpp.cmake
@@ -0,0 +1,89 @@
+#=============================================================================
+# Copyright 2018 BlazingDB, Inc.
+# Copyright 2018 Cristhian Alberto Gonzales Castillo
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Download and unpack ParquetCpp at configure time
+configure_file(${CMAKE_SOURCE_DIR}/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download/CMakeLists.txt)
+
+execute_process(
+    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download/
+)
+
+if(result)
+    message(FATAL_ERROR "CMake step for ParquetCpp failed: ${result}")
+endif()
+
+# Transitive dependencies
+set(ARROW_TRANSITIVE_DEPENDENCIES_PREFIX ${ARROW_DOWNLOAD_BINARY_DIR}/arrow-prefix/src/arrow-build)
+set(BROTLI_TRANSITIVE_DEPENDENCY_PREFIX ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu)
+set(BROTLI_STATIC_LIB_ENC ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlienc.a)
+set(BROTLI_STATIC_LIB_DEC ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlidec.a)
+set(BROTLI_STATIC_LIB_COMMON ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlicommon.a)
+set(SNAPPY_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/snappy_ep/src/snappy_ep-install/lib/libsnappy.a)
+set(ZLIB_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/zlib_ep/src/zlib_ep-install/lib/libz.a)
+set(LZ4_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/lz4_ep-prefix/src/lz4_ep/lib/liblz4.a)
+set(ZSTD_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/zstd_ep-prefix/src/zstd_ep/lib/libzstd.a)
+set(ARROW_HOME ${ARROW_ROOT})
+
+set(ENV{BROTLI_STATIC_LIB_ENC} ${BROTLI_STATIC_LIB_ENC})
+set(ENV{BROTLI_STATIC_LIB_DEC} ${BROTLI_STATIC_LIB_DEC})
+set(ENV{BROTLI_STATIC_LIB_COMMON} ${BROTLI_STATIC_LIB_COMMON})
+set(ENV{SNAPPY_STATIC_LIB} ${SNAPPY_STATIC_LIB})
+set(ENV{ZLIB_STATIC_LIB} ${ZLIB_STATIC_LIB})
+set(ENV{LZ4_STATIC_LIB} ${LZ4_STATIC_LIB})
+set(ENV{ZSTD_STATIC_LIB} ${ZSTD_STATIC_LIB})
+set(ENV{ARROW_HOME} ${ARROW_HOME})
+
+execute_process(
+    COMMAND ${CMAKE_COMMAND} --build .
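+    # Note: building the generated super-build at configure time leaves libparquet.a and its
+    # headers installed under thirdparty/parquetcpp-install before the imported targets below
+    # are defined.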
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download) + +if(result) + message(FATAL_ERROR "Build step for ParquetCpp failed: ${result}") +endif() + +# Add transitive dependency: Thrift +set(THRIFT_ROOT ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build/thrift_ep/src/thrift_ep-install) + +# Locate ParquetCpp package +set(PARQUETCPP_ROOT ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install) +set(PARQUETCPP_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build) +set(PARQUETCPP_SOURCE_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-src) + +# Dependency interfaces +find_package(Boost REQUIRED COMPONENTS regex) + +add_library(Apache::Thrift INTERFACE IMPORTED) +set_target_properties(Apache::Thrift + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${THRIFT_ROOT}/include) +set_target_properties(Apache::Thrift + PROPERTIES INTERFACE_LINK_LIBRARIES ${THRIFT_ROOT}/lib/libthrift.a) + +add_library(Apache::Arrow INTERFACE IMPORTED) +set_target_properties(Apache::Arrow + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ARROW_ROOT}/include) +set_target_properties(Apache::Arrow + PROPERTIES INTERFACE_LINK_LIBRARIES "${ARROW_ROOT}/lib/libarrow.a;${BROTLI_STATIC_LIB_ENC};${BROTLI_STATIC_LIB_DEC};${BROTLI_STATIC_LIB_COMMON};${SNAPPY_STATIC_LIB};${ZLIB_STATIC_LIB};${LZ4_STATIC_LIB};${ZSTD_STATIC_LIB}") + +add_library(Apache::ParquetCpp INTERFACE IMPORTED) +set_target_properties(Apache::ParquetCpp + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${PARQUETCPP_ROOT}/include;${PARQUETCPP_BINARY_DIR}/src;${PARQUETCPP_SOURCE_DIR}/src") +set_target_properties(Apache::ParquetCpp + PROPERTIES INTERFACE_LINK_LIBRARIES "${PARQUETCPP_ROOT}/lib/libparquet.a;Apache::Arrow;Apache::Thrift;Boost::regex") diff --git a/cmake/Templates/Arrow.CMakeLists.txt.cmake b/cmake/Templates/Arrow.CMakeLists.txt.cmake index 3fcbb108..7d4b7bc5 100644 --- a/cmake/Templates/Arrow.CMakeLists.txt.cmake +++ b/cmake/Templates/Arrow.CMakeLists.txt.cmake @@ -1,6 +1,7 @@ #============================================================================= # Copyright 2018 BlazingDB, Inc. # Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright 2018 Cristhian Alberto Gonzales Castillo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,7 +24,7 @@ project(arrow-download NONE) include(ExternalProject) -set(ARROW_VERSION "apache-arrow-0.10.0") +set(ARROW_VERSION "apache-arrow-0.9.0") if (NOT "$ENV{PARQUET_ARROW_VERSION}" STREQUAL "") set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}") @@ -34,13 +35,6 @@ message(STATUS "Using Apache Arrow version: ${ARROW_VERSION}") set(ARROW_URL "https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz") set(ARROW_CMAKE_ARGS - #Arrow dependencies - -DARROW_WITH_LZ4=OFF - -DARROW_WITH_ZSTD=OFF - -DARROW_WITH_BROTLI=OFF - -DARROW_WITH_SNAPPY=OFF - -DARROW_WITH_ZLIB=OFF - #Build settings -DARROW_BUILD_STATIC=ON -DARROW_BUILD_SHARED=OFF @@ -48,10 +42,12 @@ set(ARROW_CMAKE_ARGS -DARROW_BUILD_TESTS=OFF -DARROW_TEST_MEMCHECK=OFF -DARROW_BUILD_BENCHMARKS=OFF + -DARROW_BUILD_UTILITIES=OFF + -DARROW_JEMALLOC=OFF #Arrow modules -DARROW_IPC=ON - -DARROW_COMPUTE=OFF + -DARROW_COMPUTE=ON -DARROW_GPU=OFF -DARROW_JEMALLOC=OFF -DARROW_BOOST_VENDORED=OFF diff --git a/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake b/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake new file mode 100644 index 00000000..2f61a0c2 --- /dev/null +++ b/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake @@ -0,0 +1,44 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+cmake_minimum_required(VERSION 2.8.12)
+
+project(parquetcpp-download NONE)
+
+include(ExternalProject)
+
+set(PARQUET_VERSION apache-parquet-cpp-1.4.0)
+
+if (NOT "$ENV{PARQUET_VERSION}" STREQUAL "")
+    set(PARQUET_VERSION $ENV{PARQUET_VERSION})
+endif()
+
+message(STATUS "Using Apache ParquetCpp version: ${PARQUET_VERSION}")
+
+ExternalProject_Add(parquetcpp
+    BINARY_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build"
+    CMAKE_ARGS
+        -DCMAKE_BUILD_TYPE=RELEASE
+        -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install
+        -DPARQUET_ARROW_LINKAGE=static
+        -DPARQUET_BUILD_SHARED=OFF
+        -DPARQUET_BUILD_TESTS=OFF
+    GIT_REPOSITORY https://github.com/apache/parquet-cpp.git
+    GIT_TAG ${PARQUET_VERSION}
+    INSTALL_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install"
+    SOURCE_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-src"
+)
diff --git a/conda_environments/dev_py35.yml b/conda_environments/dev_py35.yml
index c879875e..b4fbb068 100644
--- a/conda_environments/dev_py35.yml
+++ b/conda_environments/dev_py35.yml
@@ -24,4 +24,6 @@ dependencies:
 - llvmlite=0.18.0=py35_0
 - numba=0.34.0.dev=np112py35_316
 - cmake=3.6.3=0
+- flex=2.6.0
+- bison=3.0.4
 - pyarrow=0.10.0
diff --git a/include/gdf/cffi/types.h b/include/gdf/cffi/types.h
index d8590aca..9bdf04b0 100644
--- a/include/gdf/cffi/types.h
+++ b/include/gdf/cffi/types.h
@@ -48,6 +48,8 @@ typedef enum {
     GDF_INVALID_API_CALL, /**< The arguments passed into the function were invalid */
     GDF_JOIN_DTYPE_MISMATCH, /**< Datatype mismatch between corresponding columns in left/right tables in the Join function */
     GDF_JOIN_TOO_MANY_COLUMNS, /**< Too many columns were passed in for the requested join operation*/
+
+    GDF_IO_ERROR, /**< Error occurred in the parquet-reader API that loads a parquet file into gdf_columns */
     GDF_DTYPE_MISMATCH, /**< Type mismatch between columns that should be the same type */
     GDF_UNSUPPORTED_METHOD, /**< The method requested to perform an operation was invalid or unsupported (e.g., hash vs. sort)*/
     GDF_INVALID_AGGREGATOR, /**< Invalid aggregator was specified for a groupby*/
diff --git a/include/gdf/parquet/api.h b/include/gdf/parquet/api.h
new file mode 100644
index 00000000..e77dbc1e
--- /dev/null
+++ b/include/gdf/parquet/api.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2018 BlazingDB, Inc.
+ * Copyright 2018 Cristhian Alberto Gonzales Castillo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gdf/gdf.h>
+
+#ifdef __cplusplus
+#define BEGIN_NAMESPACE_GDF_PARQUET \
+    namespace gdf {                 \
+    namespace parquet {
+#define END_NAMESPACE_GDF_PARQUET \
+    }                             \
+    }
+#else
+#define BEGIN_NAMESPACE_GDF_PARQUET
+#define END_NAMESPACE_GDF_PARQUET
+#endif
+
+BEGIN_NAMESPACE_GDF_PARQUET
+
+/// \brief Read a parquet file from a file path into an array of gdf columns
+/// \param[in] filename path to the parquet file
+/// \param[in] columns names of the columns that will be read from the file
+/// \param[out] out_gdf_columns array with the columns read
+/// \param[out] out_gdf_columns_length number of columns read
+extern "C" gdf_error read_parquet(const char *const filename,
+                                  const char *const *const columns,
+                                  gdf_column **const out_gdf_columns,
+                                  size_t *const out_gdf_columns_length);
+
+END_NAMESPACE_GDF_PARQUET
+
+#ifdef __cplusplus
+
+#include <arrow/io/interfaces.h>
+#include <string>
+#include <vector>
+
+namespace gdf {
+namespace parquet {
+
+/// \brief Read a parquet file from a file path into a vector of gdf columns
+/// \param[in] filename path to the parquet file
+/// \param[in] row_group_indices indices of the row groups that will be read from the file
+/// \param[in] column_indices indices of the columns that will be read from the file
+/// \param[out] out_gdf_columns vector of gdf_column pointers holding the data read
+gdf_error
+read_parquet_by_ids(const std::string &filename,
+                    const std::vector<std::size_t> &row_group_indices,
+                    const std::vector<std::size_t> &column_indices,
+                    std::vector<gdf_column *> &out_gdf_columns);
+
+/// \brief Read a parquet file from an arrow file interface into a vector of gdf columns
+/// \param[in] file random access file interface to the parquet data
+/// \param[in] row_group_indices indices of the row groups that will be read from the file
+/// \param[in] column_indices indices of the columns that will be read from the file
+/// \param[out] out_gdf_columns vector of gdf_column pointers holding the data read
+gdf_error
+read_parquet_by_ids(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+                    const std::vector<std::size_t> &row_group_indices,
+                    const std::vector<std::size_t> &column_indices,
+                    std::vector<gdf_column *> &out_gdf_columns);
+
+} // namespace parquet
+} // namespace gdf
+
+#endif
diff --git a/src/arrow/bit-stream.h b/src/arrow/bit-stream.h
new file mode 100644
index 00000000..79bb814a
--- /dev/null
+++ b/src/arrow/bit-stream.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2018 BlazingDB, Inc.
+ * Copyright 2018 Alexander Ocsa
+ * Copyright 2018 William Malpica
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef GDF_ARROW_UTIL_BIT_STREAM_UTILS_H
+#define GDF_ARROW_UTIL_BIT_STREAM_UTILS_H
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "arrow/util/bit-util.h"
+#include "arrow/util/bpacking.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace gdf {
+namespace arrow {
+    namespace internal {
+
+        /// Utility class to read bit/byte stream. This class can read bits or bytes
+        /// that are either byte aligned or not. It also has utilities to read multiple
+        /// bytes in one read (e.g. encoded int).
+        class BitReader {
+        public:
+            /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
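+            ///
+            /// A minimal usage sketch (illustrative only; 'page_data' and 'page_len' are
+            /// placeholder names for an encoded data page and its size in bytes):
+            ///
+            ///   BitReader reader(page_data, page_len);
+            ///   uint32_t values[32];
+            ///   int n = reader.GetBatch(/*num_bits=*/3, values, 32);  // n may be < 32 at the end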
+            BitReader(const uint8_t* buffer, int buffer_len)
+                : buffer_(buffer)
+                , max_bytes_(buffer_len)
+                , byte_offset_(0)
+                , bit_offset_(0)
+            {
+                int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+                memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+            }
+
+            BitReader()
+                : buffer_(NULL)
+                , max_bytes_(0)
+            {
+            }
+
+            void Reset(const uint8_t* buffer, int buffer_len)
+            {
+                buffer_ = buffer;
+                max_bytes_ = buffer_len;
+                byte_offset_ = 0;
+                bit_offset_ = 0;
+                int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+                memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+            }
+
+            /// Gets the next value from the buffer. Returns true if 'v' could be read or
+            /// false if there are not enough bytes left. num_bits must be <= 32.
+            template <typename T>
+            bool GetValue(int num_bits, T* v);
+
+            template <typename T>
+            void SetGpuBatchMetadata(int num_bits, T* v, int batch_size, int values_read,
+                std::vector<int>& unpack32InputOffsets,
+                std::vector<int>& unpack32InputRunLengths,
+                std::vector<int>& unpack32OutputOffsets,
+                std::vector<int>& remainderInputOffsets,
+                std::vector<int>& remainderBitOffsets,
+                std::vector<int>& remainderSetSize,
+                std::vector<int>& remainderOutputOffsets);
+
+            /// Get a number of values from the buffer. Return the number of values
+            /// actually read.
+            template <typename T>
+            int GetBatch(int num_bits, T* v, int batch_size);
+
+            /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+            /// needs to be a little-endian native type and big enough to store
+            /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+            /// be advanced to the start of the next byte before 'v' is read. Returns
+            /// false if there are not enough bytes left.
+            template <typename T>
+            bool GetAligned(int num_bytes, T* v);
+
+            /// Reads a vlq encoded int from the stream. The encoded int must start at
+            /// the beginning of a byte. Return false if there were not enough bytes in
+            /// the buffer.
+            bool GetVlqInt(int32_t* v);
+
+            // Reads a zigzag encoded int `into` v.
+            bool GetZigZagVlqInt(int32_t* v);
+
+            /// Returns the number of bytes left in the stream, not including the current
+            /// byte (i.e., there may be an additional fraction of a byte).
+            int bytes_left()
+            {
+                return max_bytes_ - (byte_offset_ + static_cast<int>(::arrow::BitUtil::Ceil(bit_offset_, 8)));
+            }
+
+            const uint8_t* get_buffer() { return buffer_; }
+            int get_buffer_len() { return max_bytes_; }
+
+            /// Maximum byte length of a vlq encoded int
+            static const int MAX_VLQ_BYTE_LEN = 5;
+
+        private:
+            const uint8_t* buffer_;
+            int max_bytes_;
+
+            /// Bytes are memcpy'd from buffer_ and values are read from this variable.
+            /// This is faster than reading values byte by byte directly from buffer_.
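+            /// At any point it caches up to 8 bytes starting at byte_offset_, of which
+            /// the low bit_offset_ bits have already been consumed.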
+            uint64_t buffered_values_;
+
+            int byte_offset_; // Offset in buffer_
+            int bit_offset_;  // Offset in buffered_values_
+        };
+
+        template <typename T>
+        inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
+                              int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+            *v = static_cast<T>(::arrow::BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
+                                *bit_offset);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+            *bit_offset += num_bits;
+            if (*bit_offset >= 64) {
+                *byte_offset += 8;
+                *bit_offset -= 64;
+
+                int bytes_remaining = max_bytes - *byte_offset;
+                if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+                    memcpy(buffered_values, buffer + *byte_offset, 8);
+                } else {
+                    memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
+                }
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800 4805)
+#endif
+                // Read bits of v that crossed into new buffered_values_
+                *v = *v | static_cast<T>(::arrow::BitUtil::TrailingBits(*buffered_values, *bit_offset)
+                                         << (num_bits - *bit_offset));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+                DCHECK_LE(*bit_offset, 64);
+            }
+        }
+
+        template <typename T>
+        inline bool BitReader::GetValue(int num_bits, T* v)
+        {
+            return GetBatch(num_bits, v, 1) == 1;
+        }
+
+        template <typename T>
+        inline void
+        BitReader::SetGpuBatchMetadata(int num_bits, T* v, int batch_size, int values_read,
+            std::vector<int>& unpack32InputOffsets,
+            std::vector<int>& unpack32InputRunLengths,
+            std::vector<int>& unpack32OutputOffsets,
+            std::vector<int>& remainderInputOffsets,
+            std::vector<int>& remainderBitOffsets,
+            std::vector<int>& remainderSetSize,
+            std::vector<int>& remainderOutputOffsets)
+        {
+            DCHECK(buffer_ != NULL);
+            // TODO: revisit this limit if necessary
+            DCHECK_LE(num_bits, 32);
+            // DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+            int bit_offset = bit_offset_;
+            int byte_offset = byte_offset_;
+            uint64_t buffered_values = buffered_values_;
+            int max_bytes = max_bytes_;
+
+            uint64_t needed_bits = num_bits * batch_size;
+            uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
+            if (remaining_bits < needed_bits) {
+                batch_size = static_cast<int>(remaining_bits) / num_bits;
+            }
+
+            int i = 0;
+            if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+                int byte_offset_start = byte_offset;
+                int bit_offset_start = bit_offset;
+                int i_start = i + values_read;
+
+                int count = 0;
+                for (; i < batch_size && bit_offset != 0; ++i) { // TODO this loop can be replaced with math
+                    bit_offset += num_bits;
+                    if (bit_offset >= 64) {
+                        byte_offset += 8;
+                        bit_offset -= 64;
+                    }
+                    count++;
+                }
+                if (count > 0) {
+                    remainderInputOffsets.push_back(byte_offset_start);
+                    remainderBitOffsets.push_back(bit_offset_start);
+                    remainderOutputOffsets.push_back(i_start);
+                    remainderSetSize.push_back(count);
+                }
+            }
+
+            int unpack_batch_size = (batch_size - i) / 32 * 32;
+
+            if (unpack_batch_size > 32) {
+                unpack32InputOffsets.push_back(byte_offset);
+                unpack32InputRunLengths.push_back(unpack_batch_size);
+                unpack32OutputOffsets.push_back(i + values_read);
+                i += unpack_batch_size;
+                byte_offset += unpack_batch_size * num_bits / 8;
+            }
+
+            int byte_offset_start = byte_offset;
+            int bit_offset_start = bit_offset;
+            int i_start = i + values_read;
+
+            int count = 0;
+            for (; i < batch_size; ++i) { // TODO this loop can be replaced with math
+                bit_offset += num_bits;
+                if (bit_offset >= 64) {
+                    byte_offset += 8;
+                    bit_offset -= 64;
+                }
+                count++;
+            }
+            if (count > 0) {
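+                // Record the values left over after the last aligned 32-value block as a
+                // 'remainder' run; they are handled separately from the unpack32 blocks.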
+                remainderInputOffsets.push_back(byte_offset_start);
+                remainderBitOffsets.push_back(bit_offset_start);
+                remainderOutputOffsets.push_back(i_start);
+                remainderSetSize.push_back(count);
+            }
+
+            bit_offset_ = bit_offset;
+            byte_offset_ = byte_offset;
+            buffered_values_ = buffered_values;
+        }
+
+        template <typename T>
+        inline int BitReader::GetBatch(int num_bits, T* v, int batch_size)
+        {
+            DCHECK(buffer_ != NULL);
+            // TODO: revisit this limit if necessary
+            DCHECK_LE(num_bits, 32);
+            DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+            int bit_offset = bit_offset_;
+            int byte_offset = byte_offset_;
+            uint64_t buffered_values = buffered_values_;
+            int max_bytes = max_bytes_;
+            const uint8_t* buffer = buffer_;
+
+            uint64_t needed_bits = num_bits * batch_size;
+            uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
+            if (remaining_bits < needed_bits) {
+                batch_size = static_cast<int>(remaining_bits) / num_bits;
+            }
+
+            int i = 0;
+            if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+                for (; i < batch_size && bit_offset != 0; ++i) {
+                    GetValue_(num_bits, &v[i], max_bytes, buffer,
+                              &bit_offset, &byte_offset, &buffered_values);
+                }
+            }
+
+            if (sizeof(T) == 4) {
+                int num_unpacked = ::arrow::internal::unpack32(
+                    reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+                    reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
+                i += num_unpacked;
+                byte_offset += num_unpacked * num_bits / 8;
+            } else {
+                const int buffer_size = 1024;
+                uint32_t unpack_buffer[buffer_size];
+                while (i < batch_size) {
+                    int unpack_size = std::min(buffer_size, batch_size - i);
+                    int num_unpacked = ::arrow::internal::unpack32(
+                        reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+                        unpack_buffer, unpack_size, num_bits);
+                    if (num_unpacked == 0) {
+                        break;
+                    }
+                    for (int k = 0; k < num_unpacked; ++k) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+                        v[i + k] = static_cast<T>(unpack_buffer[k]);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+                    }
+                    i += num_unpacked;
+                    byte_offset += num_unpacked * num_bits / 8;
+                }
+            }
+
+            int bytes_remaining = max_bytes - byte_offset;
+            if (bytes_remaining >= 8) {
+                memcpy(&buffered_values, buffer + byte_offset, 8);
+            } else {
+                memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
+            }
+            for (; i < batch_size; ++i) {
+                GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset,
+                          &byte_offset, &buffered_values);
+            }
+            bit_offset_ = bit_offset;
+            byte_offset_ = byte_offset;
+            buffered_values_ = buffered_values;
+
+            return batch_size;
+        }
+
+        template <typename T>
+        inline bool BitReader::GetAligned(int num_bytes, T* v)
+        {
+            DCHECK_LE(num_bytes, static_cast<int>(sizeof(T)));
+            int bytes_read = static_cast<int>(::arrow::BitUtil::Ceil(bit_offset_, 8));
+            if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_))
+                return false;
+
+            // Advance byte_offset to next unread byte and read num_bytes
+            byte_offset_ += bytes_read;
+            memcpy(v, buffer_ + byte_offset_, num_bytes);
+            byte_offset_ += num_bytes;
+
+            // Reset buffered_values_
+            bit_offset_ = 0;
+            int bytes_remaining = max_bytes_ - byte_offset_;
+            if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+                memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+            } else {
+                memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+            }
+            return true;
+        }
+
+        inline bool BitReader::GetVlqInt(int32_t* v)
+        {
+            *v = 0;
+            int shift = 0;
+            int num_bytes = 0;
+            uint8_t byte = 0;
+            do {
+                if (!GetAligned(1, &byte))
+                    return false;
+                *v |= (byte & 0x7F) << shift;
+                shift += 7;
+                DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN);
+            } while ((byte & 0x80) != 0);
return true; + } + + inline bool BitReader::GetZigZagVlqInt(int32_t* v) + { + int32_t u_signed; + if (!GetVlqInt(&u_signed)) + return false; + uint32_t u = static_cast(u_signed); + *reinterpret_cast(v) = (u >> 1) ^ -(static_cast(u & 1)); + return true; + } + } +} +} + +#endif diff --git a/src/arrow/bpacking.cuh b/src/arrow/bpacking.cuh new file mode 100644 index 00000000..2f3a7eb7 --- /dev/null +++ b/src/arrow/bpacking.cuh @@ -0,0 +1,3200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file was modified from its original version for inclusion in parquet-cpp. +// Original source: +// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp +// The original copyright notice follows. + +// This code is released under the +// Apache License Version 2.0 http://www.apache.org/licenses/. +// (c) Daniel Lemire 2013 + +#ifndef GDF_ARROW_UTIL_BPACKING_H +#define GDF_ARROW_UTIL_BPACKING_H + +#include "arrow/util/logging.h" + +namespace gdf { +namespace arrow { + namespace internal { + +__host__ __device__ inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) & 1; + out++; + *out = ((*in) >> 1) & 1; + out++; + *out = ((*in) >> 2) & 1; + out++; + *out = ((*in) >> 3) & 1; + out++; + *out = ((*in) >> 4) & 1; + out++; + *out = ((*in) >> 5) & 1; + out++; + *out = ((*in) >> 6) & 1; + out++; + *out = ((*in) >> 7) & 1; + out++; + *out = ((*in) >> 8) & 1; + out++; + *out = ((*in) >> 9) & 1; + out++; + *out = ((*in) >> 10) & 1; + out++; + *out = ((*in) >> 11) & 1; + out++; + *out = ((*in) >> 12) & 1; + out++; + *out = ((*in) >> 13) & 1; + out++; + *out = ((*in) >> 14) & 1; + out++; + *out = ((*in) >> 15) & 1; + out++; + *out = ((*in) >> 16) & 1; + out++; + *out = ((*in) >> 17) & 1; + out++; + *out = ((*in) >> 18) & 1; + out++; + *out = ((*in) >> 19) & 1; + out++; + *out = ((*in) >> 20) & 1; + out++; + *out = ((*in) >> 21) & 1; + out++; + *out = ((*in) >> 22) & 1; + out++; + *out = ((*in) >> 23) & 1; + out++; + *out = ((*in) >> 24) & 1; + out++; + *out = ((*in) >> 25) & 1; + out++; + *out = ((*in) >> 26) & 1; + out++; + *out = ((*in) >> 27) & 1; + out++; + *out = ((*in) >> 28) & 1; + out++; + *out = ((*in) >> 29) & 1; + out++; + *out = ((*in) >> 30) & 1; + out++; + *out = ((*in) >> 31); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + 
*out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 3); + out++; + *out = ((*in) >> 3) % (1U << 3); + out++; + *out = ((*in) >> 6) % (1U << 3); + out++; + *out = ((*in) >> 9) % (1U << 3); + out++; + *out = ((*in) >> 12) % (1U << 3); + out++; + *out = ((*in) >> 15) % (1U << 3); + out++; + *out = ((*in) >> 18) % (1U << 3); + out++; + *out = ((*in) >> 21) % (1U << 3); + out++; + *out = ((*in) >> 24) % (1U << 3); + out++; + *out = ((*in) >> 27) % (1U << 3); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 1)) << (3 - 1); + out++; + *out = ((*in) >> 1) % (1U << 3); + out++; + *out = ((*in) >> 4) % (1U << 3); + out++; + *out = ((*in) >> 7) % (1U << 3); + out++; + *out = ((*in) >> 10) % (1U << 3); + out++; + *out = ((*in) >> 13) % (1U << 3); + out++; + *out = ((*in) >> 16) % (1U << 3); + out++; + *out = ((*in) >> 19) % (1U << 3); + out++; + *out = ((*in) >> 22) % (1U << 3); + out++; + *out = ((*in) >> 25) % (1U << 3); + out++; + *out = ((*in) >> 28) % (1U << 3); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 2)) << (3 - 2); + out++; + *out = ((*in) >> 2) % (1U << 3); + out++; + *out = ((*in) >> 5) % (1U << 3); + out++; + *out = ((*in) >> 8) % (1U << 3); + out++; + *out = ((*in) >> 11) % (1U << 3); + out++; + *out = ((*in) >> 14) % (1U << 3); + out++; + *out = ((*in) >> 17) % (1U << 3); + out++; + *out = ((*in) >> 20) % (1U << 3); + out++; + *out = ((*in) >> 23) % (1U << 3); + out++; + *out = ((*in) >> 26) % (1U << 3); + out++; + *out = ((*in) >> 29); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = 
((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 5); + out++; + *out = ((*in) >> 5) % (1U << 5); + out++; + *out = ((*in) >> 10) % (1U << 5); + out++; + *out = ((*in) >> 15) % (1U << 5); + out++; + *out = ((*in) >> 20) % (1U << 5); + out++; + *out = ((*in) >> 25) % (1U << 5); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + out++; + *out = ((*in) >> 3) % (1U << 5); + out++; + *out = ((*in) >> 8) % (1U << 5); + out++; + *out = ((*in) >> 13) % (1U << 5); + out++; + *out = ((*in) >> 18) % (1U << 5); + out++; + *out = ((*in) >> 23) % (1U << 5); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 1)) << (5 - 1); + out++; + *out = ((*in) >> 1) % (1U << 5); + out++; + *out = ((*in) >> 6) % (1U << 5); + out++; + *out = ((*in) >> 11) % (1U << 5); + out++; + *out = ((*in) >> 16) % (1U << 5); + out++; + *out = ((*in) >> 21) % (1U << 5); + out++; + *out = ((*in) >> 26) % (1U << 5); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 4)) << (5 - 4); + out++; + *out = ((*in) >> 4) % (1U << 5); + out++; + *out = ((*in) >> 9) % (1U << 5); + out++; + *out = ((*in) >> 14) % (1U << 5); + out++; + *out = ((*in) >> 19) % (1U << 5); + out++; + *out = ((*in) >> 24) % (1U << 5); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 2)) << (5 - 2); + out++; + *out = ((*in) >> 2) % (1U << 5); + out++; + *out = ((*in) >> 7) % (1U << 5); + out++; + *out = ((*in) >> 12) % (1U << 5); + out++; + *out = ((*in) >> 17) % (1U << 5); + out++; + *out = ((*in) >> 22) % (1U << 5); + out++; + *out = ((*in) >> 27); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out 
= ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 7); + out++; + *out = ((*in) >> 7) % (1U << 7); + out++; + *out = ((*in) >> 14) % (1U << 7); + out++; + *out = ((*in) >> 21) % (1U << 7); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + out++; + *out = ((*in) >> 3) % (1U << 7); + out++; + *out = ((*in) >> 10) % (1U << 7); + out++; + *out = ((*in) >> 17) % (1U << 7); + out++; + *out = ((*in) >> 24) % (1U << 7); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 6)) << (7 - 6); + out++; + *out = ((*in) >> 6) % (1U << 7); + out++; + *out = ((*in) >> 13) % (1U << 7); + out++; + *out = ((*in) >> 20) % (1U << 7); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 2)) << (7 - 2); + out++; + *out = ((*in) >> 2) % (1U << 7); + out++; + *out = ((*in) >> 9) % (1U << 7); + out++; + *out = ((*in) >> 16) % (1U << 7); + out++; + *out = ((*in) >> 23) % (1U << 7); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 5)) << (7 - 5); + out++; + *out = ((*in) >> 5) % (1U << 7); + out++; + *out = ((*in) >> 12) % (1U << 7); + out++; + *out = ((*in) >> 19) % (1U << 7); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 1)) << (7 - 1); + out++; + *out = ((*in) >> 1) % (1U << 7); + out++; + *out = ((*in) >> 8) % (1U << 7); + out++; + *out = ((*in) >> 15) % (1U << 7); + out++; + *out = ((*in) >> 22) % (1U << 7); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 4)) << (7 - 4); + out++; + *out = ((*in) >> 4) % (1U << 7); + out++; + *out = ((*in) >> 11) % (1U << 7); + out++; + *out = ((*in) >> 18) % (1U << 7); + out++; + *out = ((*in) >> 25); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + 
++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 9); + out++; + *out = ((*in) >> 9) % (1U << 9); + out++; + *out = ((*in) >> 18) % (1U << 9); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + out++; + *out = ((*in) >> 4) % (1U << 9); + out++; + *out = ((*in) >> 13) % (1U << 9); + out++; + *out = ((*in) >> 22) % (1U << 9); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + out++; + *out = ((*in) >> 8) % (1U << 9); + out++; + *out = ((*in) >> 17) % (1U << 9); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 3)) << (9 - 3); + out++; + *out = ((*in) >> 3) % (1U << 9); + out++; + *out = ((*in) >> 12) % (1U << 9); + out++; + *out = ((*in) >> 21) % (1U << 9); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 7)) << (9 - 7); + out++; + *out = ((*in) >> 7) % (1U << 9); + out++; + *out = ((*in) >> 16) % (1U << 9); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 2)) << (9 - 2); + out++; + *out = ((*in) >> 2) % (1U << 9); + out++; + *out = ((*in) >> 11) % (1U << 9); + out++; + *out = ((*in) >> 20) % (1U << 9); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 6)) << (9 - 6); + out++; + *out = ((*in) >> 6) % (1U << 9); + out++; + *out = ((*in) >> 15) % (1U << 9); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 1)) << (9 - 1); + out++; + *out = ((*in) >> 1) % (1U << 9); + out++; + *out = ((*in) >> 10) % (1U << 9); + out++; + *out = ((*in) >> 19) % (1U << 9); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 5)) << (9 - 5); + out++; + *out = ((*in) >> 5) % (1U << 9); + out++; + *out = ((*in) >> 14) % (1U << 9); + out++; + *out = ((*in) >> 23); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= 
((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 11); + out++; + *out = ((*in) >> 11) % (1U << 11); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + out++; + *out = ((*in) >> 1) % (1U << 11); + out++; + *out = ((*in) >> 12) % (1U << 11); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + out++; + *out = ((*in) >> 2) % (1U << 11); + out++; + *out = ((*in) >> 13) % (1U << 11); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 3)) << (11 - 3); + out++; + *out = ((*in) >> 3) % (1U << 11); + out++; + *out = ((*in) >> 14) % (1U << 11); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 4)) << (11 - 4); + out++; + *out = ((*in) >> 4) % (1U << 11); + out++; + *out = ((*in) >> 15) % (1U << 11); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 5)) << (11 - 5); + out++; + *out = ((*in) >> 5) % (1U << 11); + out++; + *out = ((*in) >> 16) % (1U << 11); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 6)) << (11 - 6); + out++; + *out = ((*in) >> 6) % (1U << 11); + out++; + *out = ((*in) >> 17) % (1U << 11); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 7)) << (11 - 7); + out++; + *out = ((*in) >> 7) % (1U << 11); + out++; + *out = ((*in) >> 18) % (1U << 11); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 8)) << (11 - 8); + out++; + *out = ((*in) >> 8) % (1U << 11); + out++; + *out = ((*in) >> 19) % (1U << 11); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 9)) << (11 - 9); + out++; + *out = ((*in) >> 9) % (1U << 11); + out++; + *out = ((*in) >> 20) % (1U << 11); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 10)) << (11 - 10); + out++; + *out = ((*in) >> 10) % (1U << 11); + out++; + *out = ((*in) >> 21); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + 
*out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 13); + out++; + *out = ((*in) >> 13) % (1U << 13); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + out++; + *out = ((*in) >> 7) % (1U << 13); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + out++; + *out = ((*in) >> 1) % (1U << 13); + out++; + *out = ((*in) >> 14) % (1U << 13); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + out++; + *out = ((*in) >> 8) % (1U << 13); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 2)) << (13 - 2); + out++; + *out = ((*in) >> 2) % (1U << 13); + out++; + *out = ((*in) >> 15) % (1U << 13); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 9)) << (13 - 9); + out++; + *out = ((*in) >> 9) % (1U << 13); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 3)) << (13 - 3); + out++; + *out = ((*in) >> 3) % (1U << 13); + out++; + *out = ((*in) >> 16) % (1U << 13); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 10)) << (13 - 10); + out++; + *out = ((*in) >> 10) % (1U << 13); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 4)) << (13 - 4); + out++; + *out = ((*in) >> 4) % (1U << 13); + out++; + *out = ((*in) >> 17) % (1U << 13); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 11)) << (13 - 11); + out++; + *out = ((*in) >> 11) % (1U << 13); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 5)) << (13 - 5); + out++; + *out = ((*in) >> 5) % (1U << 13); + out++; + *out = ((*in) >> 18) % (1U << 13); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 12)) << (13 - 12); + out++; + *out = ((*in) >> 12) % (1U << 13); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 6)) << (13 - 6); + out++; + *out = ((*in) >> 6) % (1U << 13); + out++; + *out = ((*in) >> 19); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) 
>> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 15); + out++; + *out = ((*in) >> 15) % (1U << 15); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + out++; + *out = ((*in) >> 13) % (1U << 15); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + out++; + *out = ((*in) >> 11) % (1U << 15); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + out++; + *out = ((*in) >> 9) % (1U << 15); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 7)) << (15 - 7); + out++; + *out = ((*in) >> 7) % (1U << 15); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 5)) << (15 - 5); + out++; + *out = ((*in) >> 5) % (1U << 15); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 3)) << (15 - 3); + out++; + *out = ((*in) >> 3) % (1U << 15); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 1)) << (15 - 1); + out++; + *out = ((*in) >> 1) % (1U << 15); + out++; + *out = ((*in) >> 16) % (1U << 15); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 14)) << (15 - 14); + out++; + *out = ((*in) >> 14) % (1U << 15); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 12)) << (15 - 12); + out++; + *out = ((*in) >> 12) % (1U << 15); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 10)) << (15 - 10); + out++; + *out = ((*in) >> 10) % (1U << 15); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 8)) << (15 - 8); + out++; + *out = ((*in) >> 8) % (1U << 15); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 6)) << (15 - 6); + out++; + *out = ((*in) >> 6) % (1U << 15); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 4)) << (15 - 4); + out++; + *out = ((*in) >> 4) % (1U << 15); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 2)) << (15 - 2); + out++; + *out = ((*in) >> 2) % (1U << 15); + out++; + *out = ((*in) >> 17); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; 
+ *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + out++; + *out = ((*in) >> 2) % (1U << 17); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + out++; + *out = ((*in) >> 4) % (1U << 17); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 6)) << (17 - 6); + out++; + *out = ((*in) >> 6) % (1U << 17); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + out++; + *out = ((*in) >> 8) % (1U << 17); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 10)) << (17 - 10); + out++; + *out = ((*in) >> 10) % (1U << 17); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 12)) << (17 - 12); + out++; + *out = ((*in) >> 12) % (1U << 17); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 14)) << (17 - 14); + out++; + *out = ((*in) >> 14) % (1U << 17); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 16)) << (17 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 1)) << (17 - 1); + out++; + *out = ((*in) >> 1) % (1U << 17); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 3)) << (17 - 3); + out++; + *out = ((*in) >> 3) % (1U << 17); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 5)) << (17 - 5); + out++; + *out = ((*in) >> 5) % (1U << 17); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 7)) << (17 - 7); + out++; + *out = ((*in) >> 7) % (1U << 17); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 9)) << (17 - 9); + out++; + *out = ((*in) >> 9) % (1U << 17); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 11)) << (17 - 11); + out++; + *out = ((*in) >> 11) % (1U << 17); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 13)) << (17 - 13); + out++; + *out = ((*in) >> 13) % (1U << 17); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 15)) << (17 - 15); + out++; + *out = ((*in) >> 15); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= 
((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + out++; + *out = ((*in) >> 6) % (1U << 19); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + out++; + *out = ((*in) >> 12) % (1U << 19); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + out++; + *out = ((*in) >> 5) % (1U << 19); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 11)) << (19 - 11); + out++; + *out = ((*in) >> 11) % (1U << 19); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 17)) << (19 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 4)) << (19 - 4); + out++; + *out = ((*in) >> 4) % (1U << 19); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 10)) << (19 - 10); + out++; + *out = ((*in) >> 10) % (1U << 19); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 16)) << (19 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 3)) << (19 - 3); + out++; + *out = ((*in) >> 3) % (1U << 19); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 9)) << (19 - 9); + out++; + *out = ((*in) >> 9) % (1U << 19); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 15)) << (19 - 15); + out++; + *out = ((*in) >> 
15); + ++in; + *out |= ((*in) % (1U << 2)) << (19 - 2); + out++; + *out = ((*in) >> 2) % (1U << 19); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 8)) << (19 - 8); + out++; + *out = ((*in) >> 8) % (1U << 19); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 14)) << (19 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 1)) << (19 - 1); + out++; + *out = ((*in) >> 1) % (1U << 19); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 7)) << (19 - 7); + out++; + *out = ((*in) >> 7) % (1U << 19); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 13)) << (19 - 13); + out++; + *out = ((*in) >> 13); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + out++; + *out = ((*in) >> 10) % (1U << 21); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + out++; + *out = ((*in) >> 9) % (1U << 21); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + 
out++; + *out = ((*in) >> 8) % (1U << 21); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 18)) << (21 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 7)) << (21 - 7); + out++; + *out = ((*in) >> 7) % (1U << 21); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 17)) << (21 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 6)) << (21 - 6); + out++; + *out = ((*in) >> 6) % (1U << 21); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 16)) << (21 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 5)) << (21 - 5); + out++; + *out = ((*in) >> 5) % (1U << 21); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 15)) << (21 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 4)) << (21 - 4); + out++; + *out = ((*in) >> 4) % (1U << 21); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 14)) << (21 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 3)) << (21 - 3); + out++; + *out = ((*in) >> 3) % (1U << 21); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 13)) << (21 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 2)) << (21 - 2); + out++; + *out = ((*in) >> 2) % (1U << 21); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 12)) << (21 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 1)) << (21 - 1); + out++; + *out = ((*in) >> 1) % (1U << 21); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 11)) << (21 - 11); + out++; + *out = ((*in) >> 11); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + 
out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + out++; + *out = ((*in) >> 5) % (1U << 23); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 1)) << (23 - 1); + out++; + *out = ((*in) >> 1) % (1U << 23); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 15)) << (23 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 6)) << (23 - 6); + out++; + *out = ((*in) >> 6) % (1U << 23); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 20)) << (23 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 11)) << (23 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 2)) << (23 - 2); + out++; + *out = ((*in) >> 2) % (1U << 23); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 16)) << (23 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 7)) << (23 - 7); + out++; + *out = ((*in) >> 7) % (1U << 23); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 21)) << (23 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 12)) << (23 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 3)) << (23 - 3); + out++; + *out = ((*in) >> 3) % (1U << 23); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 17)) << (23 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 8)) << (23 - 8); + out++; + *out = ((*in) >> 8) % (1U << 23); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 22)) << (23 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 13)) << (23 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 4)) << (23 - 4); + out++; + *out = ((*in) >> 4) % (1U << 23); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 18)) << (23 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 9)) << (23 - 9); + out++; + *out = ((*in) >> 9); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 
0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + out++; + *out = ((*in) >> 4) % (1U << 25); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 1)) << (25 - 1); + out++; + *out = ((*in) >> 1) % (1U << 25); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 19)) << (25 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 12)) << (25 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 5)) << (25 - 5); + out++; + *out = ((*in) >> 5) % (1U << 25); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 23)) << (25 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 16)) << (25 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 9)) << (25 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 2)) << (25 - 2); + out++; + *out = ((*in) >> 2) % (1U << 25); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 20)) << (25 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 13)) << (25 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 6)) << (25 - 6); + out++; + *out = ((*in) >> 6) % (1U << 25); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 24)) << (25 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 17)) << (25 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= 
((*in) % (1U << 10)) << (25 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 3)) << (25 - 3); + out++; + *out = ((*in) >> 3) % (1U << 25); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 21)) << (25 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 14)) << (25 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 7)) << (25 - 7); + out++; + *out = ((*in) >> 7); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + 
out++; + *out = ((*in) >> 2) % (1U << 27); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 19)) << (27 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 14)) << (27 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 9)) << (27 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 4)) << (27 - 4); + out++; + *out = ((*in) >> 4) % (1U << 27); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 26)) << (27 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 21)) << (27 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 16)) << (27 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 11)) << (27 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 6)) << (27 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 1)) << (27 - 1); + out++; + *out = ((*in) >> 1) % (1U << 27); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 23)) << (27 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 18)) << (27 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 13)) << (27 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 8)) << (27 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 3)) << (27 - 3); + out++; + *out = ((*in) >> 3) % (1U << 27); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 25)) << (27 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 20)) << (27 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 15)) << (27 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 10)) << (27 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 5)) << (27 - 5); + out++; + *out = ((*in) >> 5); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) 
>> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 11)) << (29 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 5)) << (29 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 2)) << (29 - 2); + out++; + *out = ((*in) >> 2) % (1U << 29); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 28)) << (29 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 25)) << (29 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 22)) << (29 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 19)) << (29 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 16)) << (29 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 13)) << (29 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 10)) << (29 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 7)) << (29 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 4)) << (29 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 1)) << (29 - 1); + out++; + *out = ((*in) >> 1) % (1U << 29); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 27)) << (29 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 24)) << (29 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 21)) << (29 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 18)) << (29 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 15)) << (29 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 12)) << (29 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 9)) << (29 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 6)) << (29 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= 
((*in) % (1U << 3)) << (29 - 3); + out++; + *out = ((*in) >> 3); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 31); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 27)) << (31 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= 
((*in) % (1U << 23)) << (31 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 22)) << (31 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 21)) << (31 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 20)) << (31 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 19)) << (31 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 18)) << (31 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 17)) << (31 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 16)) << (31 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 15)) << (31 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 14)) << (31 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 13)) << (31 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 12)) << (31 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 11)) << (31 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 10)) << (31 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 9)) << (31 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 8)) << (31 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 7)) << (31 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 6)) << (31 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 5)) << (31 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 4)) << (31 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 3)) << (31 - 3); + out++; + *out = ((*in) >> 3); + ++in; + *out |= ((*in) % (1U << 2)) << (31 - 2); + out++; + *out = ((*in) >> 2); + ++in; + *out |= ((*in) % (1U << 1)) << (31 - 1); + out++; + *out = ((*in) >> 1); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + + return in; +} + +__host__ __device__ inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) { + for (int k = 0; k < 32; ++k) { + out[k] = 0; + } + return in; +} + + +} // namespace internal 
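The unrolled unpackN_32 bodies above all follow one pattern: read N 32-bit words, emit 32 values of N bits each, and OR in the high part of any value that straddles a word boundary. As a reading aid only (not part of the patch), a width-generic loop that reproduces the same packing convention could look like the sketch below; the name unpack_generic_32 is invented here purely for illustration.

#include <cstdint>

// Illustrative sketch: width-generic equivalent of the unrolled unpackN_32
// routines above (little-endian packing, 32 values of num_bits bits per call).
__host__ __device__ inline const uint32_t* unpack_generic_32(const uint32_t* in,
                                                             uint32_t* out,
                                                             int num_bits) {
    const uint32_t mask =
        (num_bits >= 32) ? 0xFFFFFFFFu : ((1u << num_bits) - 1u);
    int bit_pos = 0;  // bit offset of the next value inside the current word
    for (int k = 0; k < 32; ++k) {
        uint32_t value = (*in) >> bit_pos;
        if (bit_pos + num_bits > 32) {         // value straddles two words
            ++in;
            value |= (*in) << (32 - bit_pos);  // splice in the high bits
            bit_pos += num_bits - 32;
        } else {
            bit_pos += num_bits;
            if (bit_pos == 32) { ++in; bit_pos = 0; }
        }
        out[k] = value & mask;                 // keep only num_bits bits
    }
    return in;  // one past the num_bits words consumed
}

The unrolled versions in the header are this loop specialized per bit width, so every shift amount and mask becomes a compile-time constant; that is why they are preferred in device code over the generic form.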
+} // namespace arrow +} // namespace gdf + +#endif // GDF_ARROW_UTIL_BPACKING_H diff --git a/src/arrow/cu_decoder.cu b/src/arrow/cu_decoder.cu new file mode 100644 index 00000000..4abe90d6 --- /dev/null +++ b/src/arrow/cu_decoder.cu @@ -0,0 +1,616 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Alexander Ocsa + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cu_decoder.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "bpacking.cuh" +#include "util/pinned_allocator.cuh" + +namespace gdf +{ +namespace arrow +{ +namespace internal { + +CachingPinnedAllocator pinnedAllocator(2, 14, 29, 1024*1024*1024*1ull); + +namespace detail +{ + +#define ARROW_PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) + +#define ARROW_DEBUG (-1) +#define ARROW_INFO 0 +#define ARROW_WARNING 1 +#define ARROW_ERROR 2 +#define ARROW_FATAL 3 + +class CerrLog +{ + public: + CerrLog(int severity) // NOLINT(runtime/explicit) + : severity_(severity), + has_logged_(false) + { + } + + virtual ~CerrLog() + { + if (has_logged_) + { + std::cerr << std::endl; + } + if (severity_ == ARROW_FATAL) + { + std::exit(1); + } + } + + template + CerrLog &operator<<(const T &t) + { + if (severity_ != ARROW_DEBUG) + { + has_logged_ = true; + std::cerr << t; + } + return *this; + } + + protected: + const int severity_; + bool has_logged_; +}; + + +/// Returns the 'num_bits' least-significant bits of 'v'. 
+__device__ __host__ static inline uint64_t TrailingBits(uint64_t v, + int num_bits) +{ + if (ARROW_PREDICT_FALSE(num_bits == 0)) + return 0; + if (ARROW_PREDICT_FALSE(num_bits >= 64)) + return v; + int n = 64 - num_bits; + return (v << n) >> n; +} + +template +__device__ __host__ inline void GetValue_(int num_bits, T *v, int max_bytes, + const uint8_t *buffer, + int *bit_offset, int *byte_offset, + uint64_t *buffered_values) +{ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800) +#endif + *v = static_cast(TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + *bit_offset += num_bits; + + if (*bit_offset >= 64) + { + *byte_offset += 8; + *bit_offset -= 64; + + int bytes_remaining = max_bytes - *byte_offset; + if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) + { + memcpy(buffered_values, buffer + *byte_offset, 8); + } + else + { + memcpy(buffered_values, buffer + *byte_offset, bytes_remaining); + } +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800 4805) +#endif + // Read bits of v that crossed into new buffered_values_ + *v = *v | static_cast(TrailingBits(*buffered_values, *bit_offset) + << (num_bits - *bit_offset)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + // DCHECK_LE(*bit_offset, 64); + } +} + +} // namespace detail + +template +OutputIterator gpu_expand(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputIterator output) +{ + typedef typename thrust::iterator_difference::type + difference_type; + + difference_type input_size = thrust::distance(first1, last1); + difference_type output_size = thrust::reduce(first1, last1); + + // scan the counts to obtain output offsets for each input element + thrust::device_vector output_offsets(input_size, 0); + thrust::exclusive_scan(first1, last1, output_offsets.begin()); + + // scatter the nonzero counts into their corresponding output positions + thrust::device_vector output_indices(output_size, 0); + thrust::scatter_if(thrust::counting_iterator(0), + thrust::counting_iterator(input_size), + output_offsets.begin(), first1, output_indices.begin()); + + // compute max-scan over the output indices, filling in the holes + thrust::inclusive_scan(output_indices.begin(), output_indices.end(), + output_indices.begin(), + thrust::maximum()); + + // gather input values according to index array (output = + // first2[output_indices]) + OutputIterator output_end = output; + thrust::advance(output_end, output_size); + thrust::gather(output_indices.begin(), output_indices.end(), first2, output); + + // return output + output_size + thrust::advance(output, output_size); + return output; +} + +__host__ __device__ inline const uint32_t* unpack32(const uint32_t* in, uint32_t* out, int num_bits) { + const uint32_t* (*UnpackFunctionPtr[])(const uint32_t* in, uint32_t* out) = {nullunpacker32, unpack1_32, unpack2_32, unpack3_32, unpack4_32, unpack5_32, unpack6_32, unpack7_32, unpack8_32, unpack9_32, unpack10_32, unpack11_32, unpack12_32, unpack13_32, unpack14_32, unpack15_32, unpack16_32, unpack17_32, unpack18_32, unpack19_32, unpack20_32, unpack21_32, unpack22_32, unpack23_32, unpack24_32, unpack25_32, unpack26_32, unpack27_32, unpack28_32, unpack29_32, unpack30_32, unpack31_32, unpack32_32}; + return UnpackFunctionPtr[num_bits](in, out); +} + +template +struct unpack_functor + : public thrust::binary_function +{ + int num_bits; + unpack_functor(int num_bits) : num_bits(num_bits) { + + } + __host__ __device__ uint32_t 
operator()(uint8_t &input, T &output) + { + uint32_t *input_ptr = (uint32_t *)&input; + uint32_t *output_ptr = (uint32_t *)&output; + + unpack32(input_ptr, output_ptr, num_bits); + + return 0; + } +}; + +template + __global__ + void decode_bitpacking_32sets(uint8_t *buffer, int *output, int *input_offsets, int *input_run_lengths, int num_sets, + int * output_offsets, short bit_width, int max_num_sets_in_run, Func unpack_func) + { + + extern __shared__ uint8_t temp[]; + + const short INPUT_BLOCK_BYTES = bit_width * 32 / 8; + const short OUTPUT_BLOCK_BYTES = 32 * 4; + const short BLOCK_SIZE = 32; + const short IO_BLOCK = INPUT_BLOCK_BYTES + OUTPUT_BLOCK_BYTES; // size in bytes of INPUT and OUTPUT BLOCK + + int index = blockIdx.x * blockDim.x + threadIdx.x; + + int set_index = index/max_num_sets_in_run; + + if (set_index < num_sets){ + int intput_index = input_offsets[set_index] + INPUT_BLOCK_BYTES * (index % max_num_sets_in_run); + int output_index = output_offsets[set_index] + BLOCK_SIZE * (index % max_num_sets_in_run); + + if ((INPUT_BLOCK_BYTES * (index % max_num_sets_in_run)) < input_run_lengths[set_index]*bit_width/8) { // if we want to actually process + + uint8_t * temp_in = &temp[IO_BLOCK * threadIdx.x]; + int *temp_out = (int*)&temp[IO_BLOCK * threadIdx.x + INPUT_BLOCK_BYTES]; + + for (int i = 0; i < INPUT_BLOCK_BYTES; i++){ + temp_in[i] = buffer[intput_index + i]; + } + unpack_func(temp_in[0], temp_out[0]); + + for (int i = 0; i < BLOCK_SIZE; i++){ + output[output_index + i] = temp_out[i]; + } + } + } + } + +typedef thrust::tuple Int4; + +template +struct remainder_functor : public thrust::unary_function +{ + int max_bytes; + int num_bits; + uint8_t *d_buffer; + T *ptr_output; + remainder_functor(int max_bytes, int num_bits, uint8_t *buffer, + T *ptr_output) + : max_bytes(max_bytes), num_bits(num_bits), d_buffer(buffer), ptr_output(ptr_output) + { + } + __device__ __host__ int operator()(Int4 tuple) + { + int bit_offset = thrust::get<0>(tuple); // remainderBitOffsets[k]; + int byte_offset = thrust::get<1>(tuple); // remainderInputOffsets[k]; + uint64_t buffered_values = 0; + + int bytes_remaining = max_bytes - byte_offset; + if (bytes_remaining >= 8) + { + memcpy(&buffered_values, d_buffer + byte_offset, 8); + } + else + { + memcpy(&buffered_values, d_buffer + byte_offset, bytes_remaining); + } + int i = thrust::get<2>(tuple); // remainderOutputOffsets[k]; + int batch_size = thrust::get<2>(tuple) + thrust::get<3>(tuple); // remainderOutputOffsets[k] + remainderSetSize[k]; + for (; i < batch_size; ++i) + { + detail::GetValue_(num_bits, &ptr_output[i], max_bytes, (uint8_t *)d_buffer, + &bit_offset, &byte_offset, &buffered_values); + } + return 0; + } +}; + +template +void gpu_bit_packing_remainder( thrust::device_vector & d_buffer, + const std::vector &remainderInputOffsets, + const std::vector &remainderBitOffsets, + const std::vector &remainderSetSize, + const std::vector &remainderOutputOffsets, + thrust::device_vector& d_output, + int num_bits) +{ + + thrust::device_vector d_remainder_input_offsets(remainderInputOffsets); + thrust::device_vector d_remainder_bit_offsets(remainderBitOffsets); + thrust::device_vector d_remainder_setsize(remainderSetSize); + thrust::device_vector d_remainder_output_offsets(remainderOutputOffsets); + + int max_bytes = d_buffer.size(); + auto zip_iterator_begin = thrust::make_zip_iterator(thrust::make_tuple( + d_remainder_bit_offsets.begin(), d_remainder_input_offsets.begin(), + d_remainder_output_offsets.begin(), d_remainder_setsize.begin())); + auto 
zip_iterator_end = thrust::make_zip_iterator(thrust::make_tuple( + d_remainder_bit_offsets.end(), d_remainder_input_offsets.end(), + d_remainder_output_offsets.end(), d_remainder_setsize.end())); + + thrust::transform( + thrust::device, zip_iterator_begin, zip_iterator_end, + thrust::make_discard_iterator(), + remainder_functor(max_bytes, num_bits, d_buffer.data().get(), + d_output.data().get())); + +} + + +template +void gpu_bit_packing(const uint8_t *buffer, + const int buffer_len, + const std::vector &input_offset, + const std::vector>& bitpackset, + const std::vector &output_offset, + thrust::device_vector& d_output, + int num_bits) +{ + thrust::device_vector d_output_offset(output_offset); + int step_size = 32 * num_bits / 8; + uint8_t* h_bit_buffer; + pinnedAllocator.pinnedAllocate((void **)&h_bit_buffer, step_size * input_offset.size()); + + thrust::host_vector h_bit_offset; + for (int i = 0; i < input_offset.size(); i++){ + h_bit_offset.push_back(i*step_size); + } + int sum = 0; + for (auto &&pair : bitpackset) { + memcpy ( &h_bit_buffer[sum] , &buffer[pair.first], pair.second ); + sum += pair.second; + } + thrust::device_vector d_bit_buffer(h_bit_buffer, h_bit_buffer + step_size * input_offset.size()); + thrust::device_vector d_bit_offset(h_bit_offset); + + thrust::transform(thrust::cuda::par, + thrust::make_permutation_iterator(d_bit_buffer.begin(), d_bit_offset.begin()), + thrust::make_permutation_iterator(d_bit_buffer.end(), d_bit_offset.end()), + thrust::make_permutation_iterator(d_output.begin(), d_output_offset.begin()), + thrust::make_discard_iterator(), unpack_functor(num_bits)); + pinnedAllocator.pinnedFree(h_bit_buffer); +} + +template +int decode_using_gpu(const T * d_dictionary, int num_dictionary_values, T* d_output, const uint8_t *buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector &input_offset, + const std::vector &input_runlengths, + const std::vector &output_offset, + const std::vector &remainderInputOffsets, + const std::vector &remainderBitOffsets, + const std::vector &remainderSetSize, + const std::vector &remainderOutputOffsets, + int num_bits, int batch_size) +{ + thrust::device_vector d_indices(batch_size); + + { + thrust::device_vector d_counts(rle_runs); + thrust::device_vector d_values(rle_values); + gpu_expand(d_counts.begin(), d_counts.end(), d_values.begin(), d_indices.begin()); + } + + thrust::device_vector d_buffer(buffer_len); + thrust::copy(buffer, buffer + buffer_len, d_buffer.begin()); + if (input_offset.size() > 0){ + unpack_functor func(num_bits); + thrust::device_vector d_input_offsets(input_offset); + thrust::device_vector d_input_runlengths(input_runlengths); + thrust::device_vector d_output_offset(output_offset); + + int max_num_sets_in_run = thrust::reduce(thrust::device, + d_input_runlengths.begin(), d_input_runlengths.end(), + 0, + thrust::maximum()); + max_num_sets_in_run = max_num_sets_in_run/32; + + int max_total_sets = max_num_sets_in_run * input_offset.size(); + + int blocksize = std::min(128, max_total_sets); + int gridsize = (max_total_sets + blocksize - 1) / blocksize; + + int shared_memory = blocksize * (num_bits * 32/8 + 32 * 4); + + decode_bitpacking_32sets<<>>(thrust::raw_pointer_cast(d_buffer.data()), thrust::raw_pointer_cast(d_indices.data()), + thrust::raw_pointer_cast(d_input_offsets.data()), thrust::raw_pointer_cast(d_input_runlengths.data()), input_offset.size(), + thrust::raw_pointer_cast(d_output_offset.data()), num_bits, max_num_sets_in_run, func); + + } 
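// Launch-configuration sketch for decode_bitpacking_32sets above (the empty
// "<<>>" is presumably <<<gridsize, blocksize, shared_memory>>>, the three
// values computed right before the call). Each thread unpacks one 32-value
// bit-packed set through shared memory, so the dynamic shared-memory budget is:
//   bytes per thread = INPUT_BLOCK_BYTES + OUTPUT_BLOCK_BYTES
//                    = (num_bits * 32 / 8) + (32 * 4)
//   e.g. num_bits = 14  ->  56 + 128 = 184 bytes per thread
//   blocksize = min(128, max_total_sets)  ->  at most 128 * 184 = 23,552 bytes
//   gridsize  = (max_total_sets + blocksize - 1) / blocksize
// which stays comfortably below the usual 48 KB per-block limit for dynamic
// shared memory.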
+ + if (remainderInputOffsets.size() > 0){ + gpu_bit_packing_remainder(d_buffer, remainderInputOffsets, remainderBitOffsets, remainderSetSize, remainderOutputOffsets, d_indices, num_bits); + } + + thrust::gather(thrust::device, + d_indices.begin(), d_indices.end(), + d_dictionary, + d_output); + return batch_size; +} + +template +struct copy_functor : public thrust::unary_function +{ + __host__ __device__ T operator()(int input) + { + return static_cast(input); + } +}; + +template +int decode_def_levels(const uint8_t* buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* output, int batch_size) +{ + + thrust::device_vector d_indices(batch_size); + + { + thrust::device_vector d_counts(rle_runs); + thrust::device_vector d_values(rle_values); + gpu_expand(d_counts.begin(), d_counts.end(), d_values.begin(), d_indices.begin()); + } + + thrust::device_vector d_buffer(buffer_len); + thrust::copy(buffer, buffer + buffer_len, d_buffer.begin()); + if (input_offset.size() > 0){ + unpack_functor func(num_bits); + thrust::device_vector d_input_offsets(input_offset); + thrust::device_vector d_input_runlengths(input_runlengths); + thrust::device_vector d_output_offset(output_offset); + + int max_num_sets_in_run = thrust::reduce(thrust::device, + d_input_runlengths.begin(), d_input_runlengths.end(), + 0, + thrust::maximum()); + max_num_sets_in_run = max_num_sets_in_run/32; + + int max_total_sets = max_num_sets_in_run * input_offset.size(); + + int blocksize = std::min(128, max_total_sets); + int gridsize = (max_total_sets + blocksize - 1) / blocksize; + + int shared_memory = blocksize * (num_bits * 32/8 + 32 * 4); + + decode_bitpacking_32sets<<>>(thrust::raw_pointer_cast(d_buffer.data()), thrust::raw_pointer_cast(d_indices.data()), + thrust::raw_pointer_cast(d_input_offsets.data()), thrust::raw_pointer_cast(d_input_runlengths.data()), input_offset.size(), + thrust::raw_pointer_cast(d_output_offset.data()), num_bits, max_num_sets_in_run, func); + + } + + if (remainderInputOffsets.size() > 0){ + gpu_bit_packing_remainder(d_buffer, remainderInputOffsets, remainderBitOffsets, remainderSetSize, remainderOutputOffsets, d_indices, num_bits); + } + + thrust::transform(thrust::device, d_indices.begin(), d_indices.end(), output, copy_functor()); + return batch_size; +} + +template +int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* device_output, int batch_size) +{ + + thrust::device_vector d_output_int(batch_size); + thrust::device_vector d_buffer(buffer_len); + thrust::copy(buffer, buffer + buffer_len, d_buffer.begin()); + + if (input_offset.size() > 0){ + + unpack_functor func(num_bits); + thrust::device_vector d_input_offsets(input_offset); + thrust::device_vector d_input_runlengths(input_runlengths); + thrust::device_vector d_output_offset(output_offset); + + int max_num_sets_in_run = thrust::reduce(thrust::device, + d_input_runlengths.begin(), 
d_input_runlengths.end(), + 0, + thrust::maximum()); + max_num_sets_in_run = max_num_sets_in_run/32; + + int max_total_sets = max_num_sets_in_run * input_offset.size(); + + int blocksize = std::min(128, max_total_sets); + int gridsize = (max_total_sets + blocksize - 1) / blocksize; + + int shared_memory = blocksize * (num_bits * 32/8 + 32 * 4); + + decode_bitpacking_32sets<<>>(thrust::raw_pointer_cast(d_buffer.data()), thrust::raw_pointer_cast(d_output_int.data()), + thrust::raw_pointer_cast(d_input_offsets.data()), thrust::raw_pointer_cast(d_input_runlengths.data()), input_offset.size(), + thrust::raw_pointer_cast(d_output_offset.data()), num_bits, max_num_sets_in_run, func); + + } + + if (remainderInputOffsets.size() > 0){ + gpu_bit_packing_remainder(d_buffer, remainderInputOffsets, remainderBitOffsets, remainderSetSize, remainderOutputOffsets, d_output_int, num_bits); + } + + thrust::transform(thrust::device, d_output_int.begin(), d_output_int.end(), device_output, copy_functor()); + return batch_size; +} + + +#define CONCRETIZE_FUNCTION(T) \ +template int decode_using_gpu(const T *dictionary, int num_dictionary_values, T* d_output, const uint8_t *buffer, const int buffer_len, \ + const std::vector &rle_runs, \ + const std::vector &rle_values, \ + const std::vector &input_offset, \ + const std::vector &input_runlengths, \ + const std::vector &output_offset, \ + const std::vector &remainderInputOffsets, \ + const std::vector &remainderBitOffsets, \ + const std::vector &remainderSetSize, \ + const std::vector &remainderOutputOffsets, \ + int num_bits, \ + int batch_size \ + ) + +CONCRETIZE_FUNCTION(bool); +CONCRETIZE_FUNCTION(int32_t); +CONCRETIZE_FUNCTION(int64_t); +CONCRETIZE_FUNCTION(float); +CONCRETIZE_FUNCTION(double); + +#undef CONCRETIZE_FUNCTION + +template int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + bool* device_output, int batch_size + ); + + +template int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + int16_t* output, int batch_size + ); + +template int decode_def_levels(const uint8_t* buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + int16_t* output, int batch_size); + + + +} // namespace internal +} // namespace arrow +} // namespace gdf diff --git a/src/arrow/cu_decoder.cuh b/src/arrow/cu_decoder.cuh new file mode 100644 index 00000000..1fb0b5e8 --- /dev/null +++ b/src/arrow/cu_decoder.cuh @@ -0,0 +1,124 @@ +#ifndef _CU_DECODER_H_ +#define _CU_DECODER_H_ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Alexander Ocsa + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace gdf { +namespace arrow { +namespace internal { + + template + int decode_using_gpu(const T *dictionary, int num_dictionary_values, T* d_output, const uint8_t *buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector &input_offset, + const std::vector &intput_runlengths, + const std::vector &output_offset, + const std::vector &remainderInputOffsets, + const std::vector &remainderBitOffsets, + const std::vector &remainderSetSize, + const std::vector &remainderOutputOffsets, + int num_bits, int batch_size); + + template + int unpack_using_gpu(const uint8_t* buffer, const int buffer_len, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* output, int batch_size); + template + int decode_def_levels(const uint8_t* buffer, const int buffer_len, + const std::vector &rle_runs, + const std::vector &rle_values, + const std::vector& input_offset, + const std::vector& input_runlengths, + const std::vector& output_offset, + const std::vector& remainderInputOffsets, + const std::vector& remainderBitOffsets, + const std::vector& remainderSetSize, + const std::vector& remainderOutputOffsets, + int num_bits, + T* output, int batch_size); + + + // expands data vector that does not contain nulls into a representation that has indeterminate values where there should be nulls + // A vector of int work_space needs to be allocated to hold the map for the scatter operation. The workspace should be of size batch_size + template + void compact_to_sparse_for_nulls(T* data_in, T* data_out, const uint8_t* definition_levels, uint8_t max_definition_level, + int batch_size, int * work_space){ + + struct is_equal + { + uint8_t _val; + + __host__ __device__ is_equal(uint8_t val){ + _val = val; + } + __host__ __device__ + bool operator()(const uint8_t &x) + { + return x == _val; + } + }; + + is_equal op(max_definition_level); + thrust::counting_iterator iter(0); + auto out_iter = thrust::copy_if(iter, iter + batch_size, definition_levels, work_space, op); + int num_not_null = out_iter - work_space; + + thrust::scatter(data_in, data_in + num_not_null, work_space, data_out); + } + +} +} // namespace arrow +} // namespace gdf + + + + + + + + + + + + + + + + + + + + + + + +#endif // _CU_DECODER_H_ diff --git a/src/arrow/rle_decoder.h b/src/arrow/rle_decoder.h new file mode 100644 index 00000000..5086d7b3 --- /dev/null +++ b/src/arrow/rle_decoder.h @@ -0,0 +1,258 @@ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Alexander Ocsa + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GDF_ARROW_UTIL_RLE_DECODER_H +#define GDF_ARROW_UTIL_RLE_DECODER_H + +#include "bit-stream.h" +#include "cu_decoder.cuh" +#include +#include +#include +#include +#include + +namespace parquet { +class ColumnDescriptor; +} + +namespace gdf { +namespace arrow { + namespace internal { + + /// Decoder class for RLE encoded data. + class RleDecoder { + public: + /// Create a decoder object. buffer/buffer_len is the decoded data. + /// bit_width is the width of each value (before encoding). + RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + : bit_reader_(buffer, buffer_len) + , bit_width_(bit_width) + , current_value_(0) + , repeat_count_(0) + , literal_count_(0) + { + DCHECK_GE(bit_width_, 0); + DCHECK_LE(bit_width_, 64); + } + + RleDecoder() + : bit_width_(-1) + { + } + + void Reset(const uint8_t* buffer, int buffer_len, int bit_width) + { + DCHECK_GE(bit_width, 0); + DCHECK_LE(bit_width, 64); + bit_reader_.Reset(buffer, buffer_len); + bit_width_ = bit_width; + current_value_ = 0; + repeat_count_ = 0; + literal_count_ = 0; + } + + /// Gets the next value. Returns false if there are no more. + template + bool Get(T* val); + + /// Gets a batch of values. Returns the number of decoded elements. + template + int GetBatch(T* values, int batch_size); + + /// Like GetBatch but the values are then decoded using the provided + /// dictionary + template + int GetBatchWithDict(const T* dictionary, int num_dictionary_values, T* values, int batch_size); + + /// Like GetBatchWithDict but add spacing for null entries + template + int GetBatchWithDictSpaced(const T* dictionary, int num_dictionary_values, T* values, int batch_size, + int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset); + + protected: + BitReader bit_reader_; + /// Number of bits needed to encode the value. Must be between 0 and 64. + int bit_width_; + uint64_t current_value_; + uint32_t repeat_count_; + uint32_t literal_count_; + + private: + /// Fills literal_count_ and repeat_count_ with next values. Returns false if + /// there + /// are no more. 
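`RleDecoder` above consumes Parquet's RLE/bit-packing hybrid encoding, whose run header is a varint indicator with the run kind in the low bit, which is the same convention `NextCounts()` implements further down. A tiny standalone illustration of that header arithmetic:

```cpp
// Standalone illustration of the RLE/bit-packing hybrid run header that
// NextCounts() parses: a vlq indicator whose low bit selects the run kind.
#include <cstdint>
#include <cstdio>

int main() {
    // Indicator 0x0B = 0b1011 -> low bit 1: bit-packed (literal) run,
    // number of values = (0x0B >> 1) * 8 = 5 * 8 = 40.
    std::uint32_t literal_indicator = 0x0B;
    std::uint32_t literal_count = (literal_indicator >> 1) * 8;

    // Indicator 0x0A = 0b1010 -> low bit 0: repeated (RLE) run,
    // number of values = 0x0A >> 1 = 5; the repeated value follows in the
    // stream, stored in ceil(bit_width / 8) bytes.
    std::uint32_t repeat_indicator = 0x0A;
    std::uint32_t repeat_count = repeat_indicator >> 1;

    std::printf("literal run of %u values, repeated run of %u values\n",
                literal_count, repeat_count);
    return 0;
}
```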
+ template + bool NextCounts(); + }; + + template + inline bool RleDecoder::Get(T* val) + { + return GetBatch(val, 1) == 1; + } + + template + inline int RleDecoder::GetBatch(T* values, int batch_size) + { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + + std::vector rleRuns; + std::vector rleValues; + int numRle; + int numBitpacked; + std::vector< std::pair > bitpackset; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + while (values_read < batch_size) { + if (repeat_count_ > 0) { + int repeat_batch = std::min(batch_size - values_read, static_cast(repeat_count_)); + rleRuns.push_back(repeat_batch); + rleValues.push_back(current_value_); + + repeat_count_ -= repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = std::min(batch_size - values_read, static_cast(literal_count_)); + rleRuns.push_back(literal_batch); + rleValues.push_back(0); + + bit_reader_.SetGpuBatchMetadata( + bit_width_, values + values_read, literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + + literal_count_ -= literal_batch; + values_read += literal_batch; + } else { + if (!NextCounts()) + return values_read; + } + } + gdf::arrow::internal::decode_def_levels( + this->bit_reader_.get_buffer(), this->bit_reader_.get_buffer_len(), + rleRuns, rleValues, + unpack32InputOffsets, + unpack32InputRunLengths, + unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, bit_width_, values, batch_size); + + return values_read; + } + + template + inline int RleDecoder::GetBatchWithDict(const T* dictionary, int num_dictionary_values, T* values, + int batch_size) + { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + + std::vector rleRuns; + std::vector rleValues; + int numRle; + int numBitpacked; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + while (values_read < batch_size) { + if (repeat_count_ > 0) { + int repeat_batch = std::min(batch_size - values_read, static_cast(repeat_count_)); + rleRuns.push_back(repeat_batch); + rleValues.push_back(current_value_); + numRle++; + + repeat_count_ -= repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = std::min(batch_size - values_read, static_cast(literal_count_)); + + const int buffer_size = 1024; //@todo, check this buffer size for optimization + int indices[buffer_size]; + literal_batch = std::min(literal_batch, buffer_size); + rleRuns.push_back(literal_batch); + rleValues.push_back(0); + numBitpacked++; + bit_reader_.SetGpuBatchMetadata( + bit_width_, &indices[0], literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + literal_count_ -= literal_batch; + values_read += literal_batch; + } else { + if (!NextCounts()) + return values_read; + } + } + int actual_read = gdf::arrow::internal::decode_using_gpu(dictionary, num_dictionary_values, values, + this->bit_reader_.get_buffer(), this->bit_reader_.get_buffer_len(), + rleRuns, rleValues, + unpack32InputOffsets, + unpack32InputRunLengths, + 
unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, bit_width_, batch_size); + + return values_read; + } + + template + inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, int num_dictionary_values, T* values, + int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) + { + DCHECK_GE(bit_width_, 0); + + int values_read = GetBatchWithDict(dictionary, num_dictionary_values, values, batch_size); + + return values_read; + } + + template + inline bool RleDecoder::NextCounts() + { + // Read the next run's indicator int, it could be a literal or repeated run. + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (!result) + return false; + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + } else { + repeat_count_ = indicator_value >> 1; + bool result = bit_reader_.GetAligned( + static_cast(::arrow::BitUtil::Ceil(bit_width_, 8)), + reinterpret_cast(¤t_value_)); + DCHECK(result); + } + return true; + } + + } // namespace internal +} // namespace parquet +} // namespace gdf +#endif diff --git a/src/arrow/util/pinned_allocator.cu b/src/arrow/util/pinned_allocator.cu new file mode 100644 index 00000000..e5528ab8 --- /dev/null +++ b/src/arrow/util/pinned_allocator.cu @@ -0,0 +1,217 @@ +#include "pinned_allocator.cuh" + + cudaError_t CachingPinnedAllocator::pinnedAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes ) + { + *d_ptr = NULL; + + cudaError_t error = cudaSuccess; + + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key; + + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + cached_blocks.erase(block_itr); + + + block_itr++; + } + + // Done searching: unlock + mutex.unlock(); + } + + // Allocate the block if necessary + if (!found) + { + + // Attempt to allocate + if (cudaMallocHost((void **)&search_key.d_ptr, search_key.bytes) != cudaSuccess) + { + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key; + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end())) + { + // No need to worry about synchronization with the device: cudaFreeHost is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. + error = cudaFreeHost(block_itr->d_ptr); + if(error != cudaSuccess){ + // std::cout<<"could not free from host"; + break; + } + + // Reduce balance and erase entry + cached_bytes.free -= block_itr->bytes; + + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + error = cudaMallocHost((void **)&search_key.d_ptr, search_key.bytes); + if(error != cudaSuccess){ + return error; + } + + } + + // Insert into live blocks + mutex.lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex.unlock(); + + + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + return error; + } + + + cudaError_t CachingPinnedAllocator::pinnedFree( + void* d_ptr) + { + cudaError_t error = cudaSuccess; + + + + // Lock + mutex.lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + } + } + + // Unlock + mutex.unlock(); + + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + //TODO: see if we have to do anything here to handle concurrency + } + else + { + // Free the allocation from the runtime and cleanup the event. 
+ error = cudaFreeHost(d_ptr); + if (error != cudaSuccess) return error; + + } + + return error; + } + + + cudaError_t CachingPinnedAllocator::FreeAllCached() + { + cudaError_t error = cudaSuccess; + + mutex.lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + + + // Free device memory + error = cudaFreeHost(begin->d_ptr); + if (error != cudaSuccess) break; + + // Reduce balance and erase entry + cached_bytes.free -= begin->bytes; + + cached_blocks.erase(begin); + } + + mutex.unlock(); + + + return error; + } diff --git a/src/arrow/util/pinned_allocator.cuh b/src/arrow/util/pinned_allocator.cuh new file mode 100644 index 00000000..a99544d8 --- /dev/null +++ b/src/arrow/util/pinned_allocator.cuh @@ -0,0 +1,385 @@ +/* + * CachedPinnedAllocator.h + * + * Created on: Mar 15, 2018 + * Author: felipe + */ + +#ifndef SRC_GPUABSTRACTIONS_CACHEDPINNEDALLOCATOR_H_ +#define SRC_GPUABSTRACTIONS_CACHEDPINNEDALLOCATOR_H_ + + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + + + +#include +#include + + +#include +#include + + +#include "driver_types.h" + + +/****************************************************************************** + * CachingPinnedAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. 
+ * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingPinnedAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingPinnedAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + // int device; // device ordinal + // cudaStream_t associated_stream; // Associated associated_stream + // cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN) + + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor() : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN) + {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.d_ptr < b.d_ptr); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.bytes < b.bytes); + } + }; + + /// 
BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + + + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + CachingPinnedAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingPinnedAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(2), + min_bin(20), + max_bin(25), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.lock(); + + // if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + + cudaError_t pinnedAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes ); + + + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
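The constructors above fix the bin geometry; with the default configuration in this header (`bin_growth = 2`, `min_bin = 20`, `max_bin = 25`) a request is rounded up to the next power of the growth factor, exactly as `NearestPowerOf()` does. A quick worked example:

```cpp
// Worked example of the bin rounding done by NearestPowerOf() under the
// default configuration in this header: bin_growth = 2, min_bin = 20 (1 MiB),
// max_bin = 25 (32 MiB).
#include <cstddef>
#include <cstdio>

int main() {
    const unsigned int bin_growth = 2;
    const std::size_t request = 3u << 20;          // a 3 MiB allocation request

    unsigned int power = 0;
    std::size_t rounded = 1;
    while (rounded < request) { rounded *= bin_growth; ++power; }

    // 3 MiB is not a power of two, so it rounds up to bin 22 (4 MiB).
    std::printf("request %zu -> bin %u (%zu bytes)\n", request, power, rounded);

    // Requests under 1 MiB are bumped up to min_bin; requests over 32 MiB get
    // INVALID_BIN and bypass the cache entirely (see pinnedAllocate above).
    return 0;
}
```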
+ */ + cudaError_t pinnedFree( + void* d_ptr); + + + + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached(); + + + /** + * \brief Destructor + */ + virtual ~CachingPinnedAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + + +#endif /* SRC_GPUABSTRACTIONS_CACHEDPINNEDALLOCATOR_H_ */ diff --git a/src/bench/CMakeLists.txt b/src/bench/CMakeLists.txt new file mode 100644 index 00000000..17382abd --- /dev/null +++ b/src/bench/CMakeLists.txt @@ -0,0 +1,46 @@ + +if(GDF_BENCHMARK) + +include(ExternalProject) + +ExternalProject_Add(benchmark_ep + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=RELEASE + -DCMAKE_INSTALL_PREFIX=build + -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.4.1 + UPDATE_COMMAND "" +) +ExternalProject_Get_property(benchmark_ep BINARY_DIR) +set(BENCHMARK_ROOT ${BINARY_DIR}/build) + +file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/include) +file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/lib) + +add_library(Google::Benchmark INTERFACE IMPORTED) +add_dependencies(Google::Benchmark benchmark_ep) +set_target_properties(Google::Benchmark + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${BENCHMARK_ROOT}/include) +set_target_properties(Google::Benchmark + PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark.a) + +add_library(Google::Benchmark::Main INTERFACE IMPORTED) +set_target_properties(Google::Benchmark::Main + PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark_main.a) +endif() + + +function(GDF_ADD_BENCHMARK TARGET) + if(GDF_BENCHMARK) + list(REMOVE_AT ARGV 0) + cuda_add_executable(${TARGET} ${ARGV}) + target_include_directories(${TARGET} + PUBLIC ${CMAKE_SOURCE_DIR}/src/parquet) + target_link_libraries(${TARGET} + Google::Benchmark Google::Benchmark::Main + Threads::Threads gdf-parquet) + endif() +endfunction() + +add_subdirectory(parquet) \ No newline at end of file diff --git a/src/bench/parquet/CMakeLists.txt b/src/bench/parquet/CMakeLists.txt new file mode 100644 index 00000000..428bd241 --- /dev/null +++ b/src/bench/parquet/CMakeLists.txt @@ -0,0 +1,34 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Alexander Ocsa +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
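A hedged usage sketch for the allocator declared above: stage a host buffer through `pinnedAllocate`, use it for an asynchronous copy, and hand it back with `pinnedFree` so the block can be cached for reuse. The include path is an assumption and error handling is trimmed.

```cpp
// Usage sketch: staging a host buffer through the caching pinned allocator
// before an async copy. Error handling trimmed for brevity.
#include <cuda_runtime.h>
#include <cstring>
#include "pinned_allocator.cuh"   // assumed relative include path

int main() {
    CachingPinnedAllocator allocator;     // default bin configuration

    void *pinned = nullptr;
    size_t bytes = 1 << 20;
    if (allocator.pinnedAllocate(&pinned, bytes) != cudaSuccess) return 1;

    std::memset(pinned, 0, bytes);        // fill the staging buffer

    void *device = nullptr;
    cudaMalloc(&device, bytes);
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Pinned host memory is what lets cudaMemcpyAsync overlap with compute.
    cudaMemcpyAsync(device, pinned, bytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    allocator.pinnedFree(pinned);         // returned to the bin cache if it fits
    cudaStreamDestroy(stream);
    cudaFree(device);
    return 0;
}
```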
+#============================================================================= + +find_package(Boost REQUIRED COMPONENTS filesystem) + +set(PARQUET_FILE_PATH + ${CMAKE_SOURCE_DIR}/src/bench/parquet/huge_dataset.parquet) + +GDF_ADD_BENCHMARK(parquet-benchmark parquet-benchmark.cu + parquet-multithread-benchmark parquet-multithread-benchmark.cu + ../../tests/helper/utils.cuh + ../../tests/helper/utils.cu +) + +if (GDF_BENCHMARK) +target_compile_definitions(parquet-benchmark + PUBLIC -DPARQUET_FILE_PATH="${PARQUET_FILE_PATH}") +endif() + + diff --git a/src/bench/parquet/huge_dataset.parquet b/src/bench/parquet/huge_dataset.parquet new file mode 100644 index 00000000..6cc822be Binary files /dev/null and b/src/bench/parquet/huge_dataset.parquet differ diff --git a/src/bench/parquet/parquet-benchmark.cu b/src/bench/parquet/parquet-benchmark.cu new file mode 100644 index 00000000..2007f855 --- /dev/null +++ b/src/bench/parquet/parquet-benchmark.cu @@ -0,0 +1,234 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include "../../tests/helper/utils.cuh" + +#ifndef PARQUET_FILE_PATH +#error PARQUET_FILE_PATH must be defined for precompiling +#define PARQUET_FILE_PATH "/" +#endif + + +enum ReaderType : std::uint8_t { kGdf, kParquet }; + +template +struct Readers {}; + +template <> +struct Readers { + typedef typename gdf::parquet::BoolReader BoolReader; + typedef typename gdf::parquet::Int32Reader Int32Reader; + typedef typename gdf::parquet::Int64Reader Int64Reader; + typedef typename gdf::parquet::FloatReader FloatReader; + typedef typename gdf::parquet::DoubleReader DoubleReader; + typedef typename gdf::parquet::FileReader FileReader; + + + static inline gdf_error init_gdf_buffers(void **device_values, gdf_valid_type** device_valid, int16_t** def_levels, uint32_t values_malloc_size, gdf_size_type column_size){ + cudaError_t cuda_error = cudaMalloc(device_values, values_malloc_size); + auto n_bytes = get_number_of_bytes_for_valid(column_size); + cudaMalloc(device_valid, n_bytes); + cudaMalloc(def_levels, column_size * sizeof(int16_t)); + return GDF_SUCCESS; + } + + + static inline gdf_error buffer_to_gdf_column(gdf_column *output, void *device_values, gdf_valid_type* device_valid, uint32_t values_malloc_size, gdf_size_type column_size, gdf_dtype dtype) { + return gdf_column_view_init(output, device_values, device_valid, column_size, dtype, 0); + } + + static inline void freeDefLevels(int16_t* def_levels){ + cudaFree(def_levels); + } +}; + + +template <> +struct Readers { + typedef typename ::parquet::BoolReader BoolReader; + typedef typename ::parquet::Int32Reader Int32Reader; + typedef typename ::parquet::Int64Reader Int64Reader; + typedef typename ::parquet::FloatReader FloatReader; + typedef typename ::parquet::DoubleReader DoubleReader; + typedef typename ::parquet::ParquetFileReader FileReader; + + static 
inline gdf_error init_gdf_buffers(void **host_values, gdf_valid_type** host_valid, int16_t** def_levels, uint32_t values_malloc_size, gdf_size_type column_size){ + *host_values = malloc(values_malloc_size); + auto n_bytes = get_number_of_bytes_for_valid(column_size); + *host_valid = (gdf_valid_type*)malloc(n_bytes); + *def_levels = (int16_t*)malloc(column_size * sizeof(int16_t)); + return GDF_SUCCESS; + } + + static inline gdf_error buffer_to_gdf_column(gdf_column *output, void *host_values, gdf_valid_type* host_valid, uint32_t values_malloc_size, gdf_size_type column_size, gdf_dtype dtype) { + void *device_values; + cudaError_t cuda_error = cudaMalloc((void **)&device_values, values_malloc_size); + cudaMemcpy(device_values, host_values, values_malloc_size, cudaMemcpyHostToDevice); + + gdf_valid_type *device_valid; + auto n_bytes = get_number_of_bytes_for_valid(column_size); + cudaMalloc((void **)&device_valid, n_bytes); + cudaMemcpy(device_valid, host_valid, n_bytes, cudaMemcpyHostToDevice); + + auto zero_bits = count_zero_bits(host_valid, column_size); + + free(host_values); + free(host_valid); + return gdf_column_view_init(output, device_values, device_valid, column_size, dtype, zero_bits); + } + + static inline void freeDefLevels(int16_t* def_levels){ + free(def_levels); + } +}; + + +template +struct parquet_traits { + +}; + +#define PARQUET_TRAITS_FACTORY(ParquetTypeEnum, ParquetType, GdfTypeValue) \ +template <> struct parquet_traits { \ + typedef ParquetType parquet_type; \ + static inline gdf_dtype gdf_type() { return GdfTypeValue; } \ +} + +PARQUET_TRAITS_FACTORY(parquet::Type::BOOLEAN, bool, GDF_INT8); +PARQUET_TRAITS_FACTORY(parquet::Type::INT32, int32_t, GDF_INT32); +PARQUET_TRAITS_FACTORY(parquet::Type::INT64, int64_t, GDF_INT64); +PARQUET_TRAITS_FACTORY(parquet::Type::FLOAT, float, GDF_FLOAT32); +PARQUET_TRAITS_FACTORY(parquet::Type::DOUBLE, double, GDF_FLOAT64); + +#undef PARQUET_TRAITS_FACTORY + +template +static inline gdf_error +convert(gdf_column *column, ColumnReaderType *column_reader, int64_t amount_to_read, uint32_t batch_size) { + typedef typename parquet_traits::parquet_type parquet_type; + parquet_type* values_buffer; + gdf_valid_type* valid_bits; + int16_t * definition_level; + + auto values_malloc_size = amount_to_read * sizeof(parquet_type); + gdf_error status = Readers::init_gdf_buffers((void **)&(values_buffer), &valid_bits, &definition_level, values_malloc_size, amount_to_read); + + std::int64_t levels_read; + std::int64_t values_read = 0; + std::int64_t nulls_count; + + int64_t rows_read_total = 0; + while (column_reader->HasNext() && rows_read_total < amount_to_read) { + int64_t rows_read = column_reader->ReadBatchSpaced(batch_size, + &definition_level[rows_read_total], + nullptr, + &values_buffer[rows_read_total], + valid_bits, + 0, + &levels_read, + &values_read, + &nulls_count); + rows_read_total += rows_read; + } + + Readers::buffer_to_gdf_column(column, (void *)values_buffer, valid_bits, values_malloc_size, amount_to_read, parquet_traits::gdf_type()); + + Readers::freeDefLevels(definition_level); + + return GDF_SUCCESS; +} + + +template +static inline gdf_error containerFrom(gdf_column *column, std::shared_ptr column_reader, int64_t numRecords, uint32_t batch_size) { + + parquet::Type::type parquetDataType = column_reader->type(); + + #define WHEN(dataType, Prefix) \ + if ((dataType) == parquetDataType) \ + return convert::Prefix##Reader, dataType> \ + (column, static_cast::Prefix##Reader*>(column_reader.get()), numRecords, batch_size) + + 
WHEN(parquet::Type::BOOLEAN, Bool); + WHEN(parquet::Type::INT32, Int32); + WHEN(parquet::Type::INT64, Int64); + WHEN(parquet::Type::FLOAT, Float); + WHEN(parquet::Type::DOUBLE, Double); + + #undef WHEN + + throw std::invalid_argument("ERROR: Bad parquet column type"); +} +template +inline static void +readRowGroup(const std::unique_ptr::FileReader> &parquet_reader, uint32_t batch_size) { + + std::shared_ptr file_metadata = parquet_reader->metadata(); + const parquet::SchemaDescriptor *schema = file_metadata->schema(); + int numRowGroups = file_metadata->num_row_groups(); + + std::vector columns; + + for (int rowGroupIndex = 0; rowGroupIndex < numRowGroups; rowGroupIndex++) { + auto groupReader = parquet_reader->RowGroup(rowGroupIndex); + const parquet::RowGroupMetaData *rowGroupMetadata = groupReader->metadata(); + for (int columnIndex = 0; columnIndex < file_metadata->num_columns(); columnIndex++) { + const parquet::ColumnDescriptor *column = schema->Column(columnIndex); + std::unique_ptr columnMetaData = rowGroupMetadata->ColumnChunk( + columnIndex); + parquet::Type::type type = column->physical_type(); + + if (type != parquet::Type::BYTE_ARRAY){ + const std::shared_ptr columnReader = groupReader->Column(columnIndex); + int64_t numRecords = rowGroupMetadata->num_rows(); + + gdf_column output; + containerFrom(&output, columnReader, numRecords, batch_size); + columns.push_back(output); + } + } + } + + for(size_t i = 0; i < columns.size(); i++) + { + delete_gdf_column(&(columns[i])); + } +} + +template +static void +BM_FileRead(benchmark::State &state) { + for (auto _ : state) { + std::unique_ptr::FileReader> reader = + Readers::FileReader::OpenFile(PARQUET_FILE_PATH); + + readRowGroup(reader, state.range(0)); + } +} + +BENCHMARK_TEMPLATE(BM_FileRead, kParquet)->Arg(50000)->Arg(100000)->Arg(500000)->Arg(1000000); +BENCHMARK_TEMPLATE(BM_FileRead, kGdf)->Arg(50000)->Arg(100000)->Arg(500000)->Arg(1000000); + + diff --git a/src/bench/parquet/parquet-multithread-benchmark.cu b/src/bench/parquet/parquet-multithread-benchmark.cu new file mode 100644 index 00000000..2a831e6d --- /dev/null +++ b/src/bench/parquet/parquet-multithread-benchmark.cu @@ -0,0 +1,81 @@ +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include "../../tests/helper/utils.cuh" + +#ifndef PARQUET_FILE_PATH +#error PARQUET_FILE_PATH must be defined for precompiling +#define PARQUET_FILE_PATH "/" +#endif + +static void +BM_FileRead_mt(benchmark::State &state) { + + for (auto _ : state) { + + gdf_column *columns = nullptr; + std::size_t columns_length = 0; + gdf_error error_code = gdf::parquet::read_parquet( + PARQUET_FILE_PATH, nullptr, &columns, &columns_length); + + + for (std::size_t i = 0; i < columns_length; i++){ + cudaFree(columns[i].data); + cudaFree(columns[i].valid); + } + + } +} + + +// NOTE: this way of doing the reading singlethreaded adds some overhead. 
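The `BENCHMARK_TEMPLATE(...)->Arg(...)` registrations above follow google/benchmark's usual pattern: a templated benchmark body instantiated per reader kind and swept over batch sizes. A standalone toy version of the same registration pattern, built against `-lbenchmark`; everything in it is illustrative:

```cpp
// Toy version of the templated benchmark registration pattern used above.
#include <benchmark/benchmark.h>
#include <vector>
#include <numeric>

enum Kind { kA, kB };

template <Kind kind>
static void BM_SumVector(benchmark::State &state) {
    std::vector<int> v(static_cast<size_t>(state.range(0)), kind == kA ? 1 : 2);
    for (auto _ : state) {
        long long sum = std::accumulate(v.begin(), v.end(), 0LL);
        benchmark::DoNotOptimize(sum);   // keep the work from being optimized away
    }
}

BENCHMARK_TEMPLATE(BM_SumVector, kA)->Arg(50000)->Arg(100000);
BENCHMARK_TEMPLATE(BM_SumVector, kB)->Arg(50000)->Arg(100000);

BENCHMARK_MAIN();   // the real benchmark sources instead link libbenchmark_main.a
```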
+static void +BM_FileRead_st(benchmark::State &state) { + + for (auto _ : state) { + + const std::unique_ptr file_reader = gdf::parquet::FileReader::OpenFile(PARQUET_FILE_PATH); + + std::shared_ptr file_metadata = file_reader->metadata(); + + int numRowGroups = file_metadata->num_row_groups(); + int num_columns = file_metadata->num_columns(); + + auto schema = file_reader->RowGroup(0)->metadata()->schema(); + + std::vector row_group_indices(1); + std::vector column_indices(1); + + for (int rg = 0; rg < numRowGroups; rg++){ + for (int col = 0; col < num_columns; col++){ + + if (schema->Column(col)->physical_type() != ::parquet::Type::BYTE_ARRAY && + schema->Column(col)->physical_type() != ::parquet::Type::FIXED_LEN_BYTE_ARRAY){ + + row_group_indices[0] = rg; + column_indices[0] = col; + + std::vector out_gdf_columns; + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + PARQUET_FILE_PATH, row_group_indices, column_indices, out_gdf_columns); + + + for (std::size_t i = 0; i < out_gdf_columns.size(); i++){ + cudaFree(out_gdf_columns[i]->data); + cudaFree(out_gdf_columns[i]->valid); + } + + } + } + } + + + } +} + +BENCHMARK(BM_FileRead_mt); +BENCHMARK(BM_FileRead_st); diff --git a/src/parquet/api.cpp b/src/parquet/api.cpp new file mode 100644 index 00000000..75a3878c --- /dev/null +++ b/src/parquet/api.cpp @@ -0,0 +1,683 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
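The helpers used in these benchmarks (`get_number_of_bytes_for_valid`, `count_zero_bits`) revolve around the gdf validity bitmap: one bit per row, presumably least-significant bit first. A small illustration of that arithmetic, with `bytes_for_valid` as a stand-in for the real helper:

```cpp
// Illustration of the validity-bitmap arithmetic: a column of n rows needs
// ceil(n / 8) bitmask bytes, and the null count is the number of zero bits
// among the first n bits (LSB-first assumed here).
#include <cstdint>
#include <cstdio>

static std::size_t bytes_for_valid(std::size_t rows) { return (rows + 7) / 8; }

int main() {
    const std::size_t rows = 10;              // 10 rows -> 2 bitmask bytes
    std::uint8_t valid[2] = {0xFF, 0x01};     // rows 0..8 valid, row 9 null

    std::size_t nulls = 0;
    for (std::size_t i = 0; i < rows; ++i)
        if (!((valid[i / 8] >> (i % 8)) & 1)) ++nulls;

    std::printf("%zu bytes, %zu null(s)\n", bytes_for_valid(rows), nulls); // 2, 1
    return 0;
}
```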
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include + +BEGIN_NAMESPACE_GDF_PARQUET + +namespace { + +struct ParquetTypeHash { + template + std::size_t + operator()(T t) const { + return static_cast(t); + } +}; + +const std::unordered_map<::parquet::Type::type, gdf_dtype, ParquetTypeHash> + dtype_from_physical_type_map{ + {::parquet::Type::BOOLEAN, GDF_INT8}, + {::parquet::Type::INT32, GDF_INT32}, + {::parquet::Type::INT64, GDF_INT64}, + {::parquet::Type::FLOAT, GDF_FLOAT32}, + {::parquet::Type::DOUBLE, GDF_FLOAT64}, + }; + +const std:: + unordered_map<::parquet::LogicalType::type, gdf_dtype, ParquetTypeHash> + dtype_from_logical_type_map{ + {::parquet::LogicalType::NONE, GDF_invalid}, + {::parquet::LogicalType::UTF8, GDF_invalid}, + {::parquet::LogicalType::MAP, GDF_invalid}, + {::parquet::LogicalType::MAP_KEY_VALUE, GDF_invalid}, + {::parquet::LogicalType::LIST, GDF_invalid}, + {::parquet::LogicalType::ENUM, GDF_invalid}, + {::parquet::LogicalType::DECIMAL, GDF_invalid}, + {::parquet::LogicalType::DATE, GDF_DATE32}, + {::parquet::LogicalType::TIME_MILLIS, GDF_invalid}, + {::parquet::LogicalType::TIME_MICROS, GDF_invalid}, + {::parquet::LogicalType::TIMESTAMP_MILLIS, GDF_TIMESTAMP}, + {::parquet::LogicalType::TIMESTAMP_MICROS, GDF_invalid}, + {::parquet::LogicalType::UINT_8, GDF_invalid}, + {::parquet::LogicalType::UINT_16, GDF_invalid}, + {::parquet::LogicalType::UINT_32, GDF_invalid}, + {::parquet::LogicalType::UINT_64, GDF_invalid}, + {::parquet::LogicalType::INT_8, GDF_INT8}, + {::parquet::LogicalType::INT_16, GDF_INT16}, + {::parquet::LogicalType::INT_32, GDF_INT32}, + {::parquet::LogicalType::INT_64, GDF_INT64}, + {::parquet::LogicalType::JSON, GDF_invalid}, + {::parquet::LogicalType::BSON, GDF_invalid}, + {::parquet::LogicalType::INTERVAL, GDF_invalid}, + {::parquet::LogicalType::NA, GDF_invalid}, + }; + +static inline gdf_dtype +_DTypeFrom(const ::parquet::ColumnDescriptor *const column_descriptor) { + const ::parquet::LogicalType::type logical_type = + column_descriptor->logical_type(); + + if (logical_type != ::parquet::LogicalType::NONE) { + return dtype_from_logical_type_map.at(logical_type); + } + + const ::parquet::Type::type physical_type = + column_descriptor->physical_type(); + + return dtype_from_physical_type_map.at(physical_type); +} + +static inline gdf_error +_ReadColumn(const std::shared_ptr &row_group_reader, + const std::vector & column_indices, + std::size_t offsets[], + gdf_column *const gdf_columns) { + for (std::size_t column_reader_index = 0; + column_reader_index < column_indices.size(); + column_reader_index++) { + const gdf_column &_gdf_column = gdf_columns[column_reader_index]; + const std::shared_ptr<::parquet::ColumnReader> column_reader = + row_group_reader->Column( + static_cast(column_indices[column_reader_index])); + + switch (column_reader->type()) { +#define WHEN(TYPE) \ + case ::parquet::Type::TYPE: { \ + std::shared_ptr>> \ + reader = std::static_pointer_cast>>(column_reader); \ + if (reader->HasNext()) { \ + offsets[column_reader_index] += \ + reader->ToGdfColumn(_gdf_column, offsets[column_reader_index]); \ + } \ + } break + WHEN(BOOLEAN); + WHEN(INT32); + WHEN(INT64); + WHEN(FLOAT); + WHEN(DOUBLE); + default: +#ifdef GDF_DEBUG + std::cerr << "Column type error from file" << std::endl; +#endif + return GDF_IO_ERROR; //TODO: improve using exception handling +#undef WHEN + } + } + return GDF_SUCCESS; +} + +static inline gdf_error 
+_ReadFile(const std::unique_ptr &file_reader, + const std::vector & indices, + gdf_column *const gdf_columns) { + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_rows = + static_cast(metadata->num_rows()); + const std::size_t num_row_groups = + static_cast(metadata->num_row_groups()); + + std::size_t offsets[indices.size()]; + for (std::size_t i = 0; i < indices.size(); i++) { offsets[i] = 0; } + + for (std::size_t row_group_index = 0; row_group_index < num_row_groups; + row_group_index++) { + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + gdf_error status = + _ReadColumn(row_group_reader, indices, offsets, gdf_columns); + if (status != GDF_SUCCESS) { return status; } + } + + return GDF_SUCCESS; +} + +static inline gdf_error +_ReadFile(const std::unique_ptr &file_reader, + const std::vector & row_group_indices, + const std::vector & column_indices, + gdf_column *const gdf_columns) { + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_rows = + static_cast(metadata->num_rows()); + + std::size_t offsets[column_indices.size()]; + for (std::size_t i = 0; i < column_indices.size(); i++) { offsets[i] = 0; } + + for (const std::size_t row_group_index : row_group_indices) { + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + gdf_error status = + _ReadColumn(row_group_reader, column_indices, offsets, gdf_columns); + if (status != GDF_SUCCESS) { return status; } + } + + return GDF_SUCCESS; +} + + +struct ParquetReaderJob { + + std::size_t row_group_index; + std::size_t column_index; + std::size_t column_index_in_read_set; + +// std::shared_ptr row_group_reader; + std::shared_ptr<::parquet::ColumnReader> column_reader; + + const gdf_column & column; + std::size_t offset; + + gdf_valid_type first_valid_byte; + gdf_valid_type last_valid_byte; + + ParquetReaderJob(std::size_t _row_group_index, + std::size_t _column_index, + std::size_t _column_index_in_read_set, +// std::shared_ptr _row_group_reader, + std::shared_ptr<::parquet::ColumnReader> _column_reader, + const gdf_column & _column, + std::size_t _offset ) + : row_group_index(_row_group_index), + column_index(_column_index), + column_index_in_read_set(_column_index_in_read_set), +// row_group_reader(std::move(_row_group_reader)), + column_reader(std::move(_column_reader)), + column(std::move(_column)), + offset(_offset) + {} +}; + + + +void _ProcessParquetReaderJobsThread(std::vector & jobs, std::mutex & lock, + int & job_index, gdf_error & gdf_error_out){ + + lock.lock(); + int current_job = job_index; + job_index++; + lock.unlock(); + + gdf_error current_gdf_error = GDF_SUCCESS; + + while (current_job < jobs.size()){ + + switch (jobs[current_job].column_reader->type()) { + #define WHEN(TYPE) \ + case ::parquet::Type::TYPE: { \ + std::shared_ptr>> \ + reader = std::static_pointer_cast>>(jobs[current_job].column_reader); \ + if (reader->HasNext()) { \ + reader->ToGdfColumn(jobs[current_job].column, jobs[current_job].offset, jobs[current_job].first_valid_byte, jobs[current_job].last_valid_byte); \ + } \ + } break + WHEN(BOOLEAN); + WHEN(INT32); + WHEN(INT64); + WHEN(FLOAT); + WHEN(DOUBLE); + default: + #ifdef GDF_DEBUG + std::cerr << "Column type error from file" << std::endl; + #endif + current_gdf_error = GDF_IO_ERROR; //TODO: improve using exception handling + #undef WHEN + } + + + lock.lock(); + if (gdf_error_out != GDF_SUCCESS){ // if error we 
want to exit + current_job = jobs.size(); + } else if (current_gdf_error != GDF_SUCCESS) { // if error we want to exit + gdf_error_out = current_gdf_error; + current_job = jobs.size(); + } else { + current_job = job_index; + } + job_index++; + lock.unlock(); + } + +} + +gdf_error _ProcessParquetReaderJobs(std::vector & jobs){ + + std::mutex lock; + int job_index = 0; + gdf_error gdf_error_out = GDF_SUCCESS; + + int num_threads = std::thread::hardware_concurrency(); + num_threads = jobs.size() < num_threads ? jobs.size() : num_threads; + + +// _ProcessParquetReaderJobsThread(jobs, lock, job_index, gdf_error_out); + + std::vector threads(num_threads); + + for (int i = 0; i < num_threads; i++){ + threads[i] = std::thread(_ProcessParquetReaderJobsThread, + std::ref(jobs), std::ref(lock), std::ref(job_index), std::ref(gdf_error_out)); + } + for (int i = 0; i < num_threads; i++){ + threads[i].join(); + } + + + + return gdf_error_out; +} + + +static inline gdf_error +_ReadFileMultiThread(const std::unique_ptr &file_reader, + const std::vector & row_group_indices, + const std::vector & column_indices, + gdf_column *const gdf_columns) { + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_rows = + static_cast(metadata->num_rows()); + + + std::vector jobs; + + std::vector offsets(row_group_indices.size(), 0); + + for (std::size_t row_group_index_in_set = 0; row_group_index_in_set < row_group_indices.size(); + row_group_index_in_set++) { + + std::size_t row_group_index = row_group_indices[row_group_index_in_set]; + + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + int64_t num_rows = row_group_reader->metadata()->num_rows(); + + + for (std::size_t column_reader_index = 0; + column_reader_index < column_indices.size(); + column_reader_index++) { + const gdf_column &_gdf_column = gdf_columns[column_reader_index]; + const std::shared_ptr<::parquet::ColumnReader> column_reader = + row_group_reader->Column( + static_cast(column_indices[column_reader_index])); + + jobs.emplace_back(row_group_index, column_indices[column_reader_index], + column_reader_index, column_reader, + _gdf_column, offsets[row_group_index_in_set]); + + } + + if (row_group_index_in_set < row_group_indices.size() - 1){ + offsets[row_group_index_in_set + 1] = offsets[row_group_index_in_set] + num_rows; + } + } + + gdf_error gdf_error_out = _ProcessParquetReaderJobs(jobs); + + // now lets fix all the valid bytes that were shared for a column accross rowgroups + if (row_group_indices.size() > 1){ + for (std::size_t column_reader_index = 0; column_reader_index < column_indices.size(); + column_reader_index++) { + + for (std::size_t row_group_index_in_set = 0; row_group_index_in_set < row_group_indices.size() - 1; + row_group_index_in_set++) { + + int job_index1 = (row_group_index_in_set * column_indices.size()) + column_reader_index; + int job_index2 = ((row_group_index_in_set + 1) * column_indices.size()) + column_reader_index; + + gdf_valid_type merged = jobs[job_index1].last_valid_byte | jobs[job_index2].first_valid_byte; + + // determine location of where the merged byte goes + // copy merged into valid + std::size_t merged_byte_offset = (offsets[row_group_index_in_set + 1]/8); + + cudaMemcpy(gdf_columns[column_reader_index].valid + merged_byte_offset, &merged, sizeof(gdf_valid_type), cudaMemcpyHostToDevice); + } + } + } + + + return gdf_error_out; +} + + + +template <::parquet::Type::type TYPE> +static inline gdf_error 
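`_ProcessParquetReaderJobs` above hands out job indices to worker threads from a shared counter guarded by a mutex, and lets any worker's error short-circuit the rest. A self-contained sketch of that dispatch pattern with a trivial stand-in for the per-job decoding work:

```cpp
// Sketch of the mutex-guarded job-index dispatch used by
// _ProcessParquetReaderJobs: each worker claims the next index under the lock
// and exits early if a failure has been flagged.
#include <mutex>
#include <thread>
#include <vector>
#include <cstdio>

static void worker(std::vector<int> &jobs, std::mutex &lock,
                   int &job_index, bool &failed) {
    lock.lock();
    int current = job_index++;
    lock.unlock();

    while (current < static_cast<int>(jobs.size())) {
        jobs[current] *= 2;                 // stand-in for decoding one column chunk

        lock.lock();
        current = failed ? static_cast<int>(jobs.size())  // bail out on error
                         : job_index;
        ++job_index;
        lock.unlock();
    }
}

int main() {
    std::vector<int> jobs(16, 1);
    std::mutex lock;
    int job_index = 0;
    bool failed = false;

    unsigned n = std::thread::hardware_concurrency();
    if (n == 0 || n > jobs.size()) n = static_cast<unsigned>(jobs.size());

    std::vector<std::thread> threads;
    for (unsigned i = 0; i < n; ++i)
        threads.emplace_back(worker, std::ref(jobs), std::ref(lock),
                             std::ref(job_index), std::ref(failed));
    for (auto &t : threads) t.join();

    std::printf("first job value: %d\n", jobs[0]);    // 2
    return 0;
}
```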
+_AllocateGdfColumn(const std::size_t num_rows, + const ::parquet::ColumnDescriptor *const column_descriptor, + gdf_column & _gdf_column) { + const std::size_t value_byte_size = + static_cast(::parquet::type_traits::value_byte_size); + + cudaError_t status = + cudaMalloc(&_gdf_column.data, num_rows * value_byte_size); + if (status != cudaSuccess) { +#ifdef GDF_DEBUG + std::cerr << "Allocation error for data\n" << e.what() << std::endl; +#endif + return GDF_IO_ERROR; + } + + status = cudaMalloc(reinterpret_cast(&_gdf_column.valid), + ::arrow::BitUtil::BytesForBits(num_rows)); + if (status != cudaSuccess) { +#ifdef GDF_DEBUG + std::cerr << "Allocation error for valid\n" << e.what() << std::endl; +#endif + return GDF_IO_ERROR; + } + + _gdf_column.size = num_rows; + _gdf_column.dtype = _DTypeFrom(column_descriptor); + + return GDF_SUCCESS; +} // namespace + +static inline std::vector +_ColumnDescriptorsFrom(const std::unique_ptr &file_reader, + const std::vector & indices) { + const auto &row_group_reader = file_reader->RowGroup(0); + + std::vector column_descriptors; + column_descriptors.reserve(indices.size()); + + for (const std::size_t i : indices) { + column_descriptors.emplace_back(row_group_reader->Column(i)->descr()); + } + + return column_descriptors; +} + +static inline gdf_error +_AllocateGdfColumns(const std::unique_ptr &file_reader, + const std::vector & row_group_indices, + const std::vector & column_indices, + gdf_column *const gdf_columns) { + const std::vector column_descriptors = + _ColumnDescriptorsFrom(file_reader, column_indices); + + int64_t num_rows = 0; + for (std::size_t row_group_index_in_set = 0; row_group_index_in_set < row_group_indices.size(); + row_group_index_in_set++) { + + std::size_t row_group_index = row_group_indices[row_group_index_in_set]; + + const auto row_group_reader = + file_reader->RowGroup(static_cast(row_group_index)); + + num_rows += row_group_reader->metadata()->num_rows(); + } + + + const std::size_t num_columns = column_indices.size(); + + +#define WHEN(TYPE) \ + case ::parquet::Type::TYPE: \ + _AllocateGdfColumn<::parquet::Type::TYPE>( \ + num_rows, column_descriptor, _gdf_column); \ + break + + for (std::size_t i = 0; i < num_columns; i++) { + gdf_column & _gdf_column = gdf_columns[i]; + const ::parquet::ColumnDescriptor *const column_descriptor = + column_descriptors[i]; + + switch (column_descriptor->physical_type()) { + WHEN(BOOLEAN); + WHEN(INT32); + WHEN(INT64); + WHEN(FLOAT); + WHEN(DOUBLE); + default: +#ifdef GDF_DEBUG + std::cerr << "Column type not supported" << std::endl; +#endif + return GDF_IO_ERROR; + } + } +#undef WHEN + return GDF_SUCCESS; +} + +static inline gdf_column * +_CreateGdfColumns(const std::size_t num_columns) try { + return new gdf_column[num_columns]; +} catch (const std::bad_alloc &e) { +#ifdef GDF_DEBUG + std::cerr << "Allocation error for gdf columns\n" << e.what() << std::endl; +#endif + return nullptr; +} + + +static inline std::vector +_GetColumnIndices(const std::unique_ptr &file_reader, + const char *const *const raw_names){ + + std::vector indices; + + const std::shared_ptr &metadata = + file_reader->metadata(); + + const std::size_t num_columns = + static_cast(metadata->num_columns()); + + auto schema = file_reader->RowGroup(0)->metadata()->schema(); + + std::vector> parquet_columns; + parquet_columns.reserve(num_columns); + + for (std::size_t i = 0; i < num_columns; i++) { + if (schema->Column(i)->physical_type() != ::parquet::Type::BYTE_ARRAY && + schema->Column(i)->physical_type() != 
::parquet::Type::FIXED_LEN_BYTE_ARRAY){ + + parquet_columns.push_back(std::make_pair(schema->Column(i)->name(), i)); + + } + } + + if (raw_names != nullptr) { + for (const char *const *name_ptr = raw_names; *name_ptr != nullptr; + name_ptr++) { + + std::string filter_name = *name_ptr; + for (std::size_t i = 0; i < parquet_columns.size(); i++) { + if (filter_name == parquet_columns[i].first){ + indices.push_back(parquet_columns[i].second); + break; + } + } + } + } else { + for (std::size_t i = 0; i < parquet_columns.size(); i++) { + indices.push_back(parquet_columns[i].second); + } + } + return indices; +} + + +static inline gdf_error +_CheckMinimalData(const std::unique_ptr &file_reader) { + const std::shared_ptr &metadata = + file_reader->metadata(); + + if (metadata->num_row_groups() == 0) { return GDF_IO_ERROR; } + + if (metadata->num_rows() == 0) { return GDF_IO_ERROR; } + + return GDF_SUCCESS; +} + +static inline std::unique_ptr +_OpenFile(const std::string &filename) try { + return FileReader::OpenFile(filename); +} catch (std::exception &e) { +#ifdef GDF_DEBUG + std::cerr << "Open file\n" << e.what() << std::endl; +#endif + return nullptr; +} + +static inline std::unique_ptr +_OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file) try { + return FileReader::OpenFile(file); +} catch (std::exception &e) { +#ifdef GDF_DEBUG + std::cerr << "Open file\n" << e.what() << std::endl; +#endif + return nullptr; +} + +} // namespace + + +static inline gdf_error +_read_parquet_by_ids(const std::unique_ptr & file_reader, + const std::vector &row_group_indices, + const std::vector &column_indices, + gdf_column *const gdf_columns) { + + if (gdf_columns == nullptr) { return GDF_IO_ERROR; } + + if (_AllocateGdfColumns(file_reader, row_group_indices, column_indices, gdf_columns) + != GDF_SUCCESS) { + return GDF_IO_ERROR; + } + + if (_ReadFileMultiThread(file_reader, row_group_indices, column_indices, gdf_columns) + != GDF_SUCCESS) { + return GDF_IO_ERROR; + } + + return GDF_SUCCESS; +} + + +gdf_error +read_parquet_by_ids(const std::string & filename, + const std::vector &row_group_indices, + const std::vector &column_indices, + std::vector & out_gdf_columns) { + + const std::unique_ptr file_reader = _OpenFile(filename); + + if (!file_reader) { return GDF_IO_ERROR; } + + if (_CheckMinimalData(file_reader) != GDF_SUCCESS) { return GDF_IO_ERROR; } + + gdf_column *const gdf_columns = _CreateGdfColumns(column_indices.size()); + + gdf_error status = _read_parquet_by_ids(std::move(file_reader), row_group_indices, column_indices, gdf_columns); + + for (std::size_t i = 0; i < column_indices.size(); i++) { + out_gdf_columns.push_back(&gdf_columns[i]); + } + + return status; +} + +gdf_error +read_parquet_by_ids(std::shared_ptr<::arrow::io::RandomAccessFile> file, + const std::vector &row_group_indices, + const std::vector &column_indices, + std::vector & out_gdf_columns) { + + const std::unique_ptr file_reader = _OpenFile(file); + + if (!file_reader) { return GDF_IO_ERROR; } + + if (_CheckMinimalData(file_reader) != GDF_SUCCESS) { return GDF_IO_ERROR; } + + gdf_column *const gdf_columns = _CreateGdfColumns(column_indices.size()); + + gdf_error status = _read_parquet_by_ids(std::move(file_reader), row_group_indices, column_indices, gdf_columns); + + for (std::size_t i = 0; i < column_indices.size(); i++) { + out_gdf_columns.push_back(&gdf_columns[i]); + } + + return status; +} + +extern "C" { + +gdf_error +read_parquet(const char *const filename, + const char *const *const columns, + gdf_column **const 
out_gdf_columns, + size_t *const out_gdf_columns_length) { + + const std::unique_ptr file_reader = _OpenFile(filename); + + if (!file_reader) { return GDF_IO_ERROR; } + + if (_CheckMinimalData(file_reader) != GDF_SUCCESS) { return GDF_IO_ERROR; } + + const std::vector column_indices = + _GetColumnIndices(file_reader, columns); + + const std::shared_ptr<::parquet::FileMetaData> &metadata = + file_reader->metadata(); + const std::size_t num_row_groups = + static_cast(metadata->num_row_groups()); + + std::vector row_group_ind(num_row_groups); + std::iota( row_group_ind.begin(), row_group_ind.end(), 0); + + const std::vector row_group_indices(row_group_ind); + + gdf_column *const gdf_columns = _CreateGdfColumns(column_indices.size()); + + gdf_error status = _read_parquet_by_ids(std::move(file_reader), row_group_indices, column_indices, gdf_columns); + + *out_gdf_columns = gdf_columns; + *out_gdf_columns_length = column_indices.size(); + + return status; + +} +} + +END_NAMESPACE_GDF_PARQUET diff --git a/src/parquet/column_reader.cu b/src/parquet/column_reader.cu new file mode 100644 index 00000000..5084242d --- /dev/null +++ b/src/parquet/column_reader.cu @@ -0,0 +1,717 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "column_reader.h" +#include "dictionary_decoder.cuh" +#include "plain_decoder.cuh" + +#include +#include "../util/bit_util.cuh" + +namespace gdf +{ +namespace parquet +{ + +template +static inline void +_ConfigureDictionary( + const ::parquet::Page *page, + std::unordered_map> &decoders, + const ::parquet::ColumnDescriptor *const column_descriptor, + ::arrow::MemoryPool *const pool, + DecoderType **out_decoder) +{ + const ::parquet::DictionaryPage *dictionary_page = + static_cast(page); + + int encoding = static_cast(dictionary_page->encoding()); + if (dictionary_page->encoding() == ::parquet::Encoding::PLAIN_DICTIONARY || dictionary_page->encoding() == ::parquet::Encoding::PLAIN) + { + encoding = static_cast(::parquet::Encoding::RLE_DICTIONARY); + } + + auto it = decoders.find(encoding); + if (it != decoders.end()) + { + throw ::parquet::ParquetException( + "Column cannot have more than one dictionary."); + } + + if (dictionary_page->encoding() == ::parquet::Encoding::PLAIN_DICTIONARY || dictionary_page->encoding() == ::parquet::Encoding::PLAIN) + { + internal::PlainDecoder dictionary(column_descriptor); + dictionary.SetData( + dictionary_page->num_values(), page->data(), page->size()); + + auto decoder = std::make_shared>(column_descriptor, pool); + decoder->SetDict(&dictionary); + decoders[encoding] = decoder; + } + else + { + ::parquet::ParquetException::NYI( + "only plain dictionary encoding has been implemented"); + } + + *out_decoder = decoders[encoding].get(); +} + +static inline bool +_IsDictionaryIndexEncoding(const ::parquet::Encoding::type &e) +{ + return e == ::parquet::Encoding::RLE_DICTIONARY || e == ::parquet::Encoding::PLAIN_DICTIONARY; +} + +template +static inline std::int64_t +_ReadValues(DecoderType *decoder, std::int64_t batch_size, T *out) +{ + std::int64_t num_decoded = + decoder->Decode(out, static_cast(batch_size)); + return num_decoded; +} + +template +bool ColumnReader::HasNext() +{ + if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) + { + if (!ReadNewPage() || num_buffered_values_ == 0) + { + return false; + } + } + return true; +} + +template +bool ColumnReader::ReadNewPage() +{ + const std::uint8_t *buffer; + + for (;;) + { + current_page_ = pager_->NextPage(); + if (!current_page_) + { + return false; + } + + if (current_page_->type() == ::parquet::PageType::DICTIONARY_PAGE) + { + _ConfigureDictionary(current_page_.get(), + decoders_, + descr_, + pool_, + ¤t_decoder_); + continue; + } + else if (current_page_->type() == ::parquet::PageType::DATA_PAGE) + { + const ::parquet::DataPage *page = + static_cast(current_page_.get()); + + num_buffered_values_ = page->num_values(); + num_decoded_values_ = 0; + buffer = page->data(); + + std::int64_t data_size = page->size(); + + if (descr_->max_repetition_level() > 0) + { + std::int64_t rep_levels_bytes = + repetition_level_decoder_.SetData( + page->repetition_level_encoding(), + descr_->max_repetition_level(), + static_cast(num_buffered_values_), + buffer); + buffer += rep_levels_bytes; + data_size -= rep_levels_bytes; + } + + if (descr_->max_definition_level() > 0) + { + std::int64_t def_levels_bytes = + def_level_decoder_.SetData( + page->definition_level_encoding(), + descr_->max_definition_level(), + static_cast(num_buffered_values_), + buffer); + buffer += def_levels_bytes; + data_size -= def_levels_bytes; + } + + ::parquet::Encoding::type encoding = page->encoding(); 
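+
+            // Data pages written with dictionary indices may report either
+            // PLAIN_DICTIONARY or RLE_DICTIONARY; both are decoded the same
+            // way, so the encoding is normalized to RLE_DICTIONARY before the
+            // decoder cache below is consulted (or a new decoder is created).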
+ + if (_IsDictionaryIndexEncoding(encoding)) + { + encoding = ::parquet::Encoding::RLE_DICTIONARY; + } + + auto it = decoders_.find(static_cast(encoding)); + if (it != decoders_.end()) + { + if (encoding == ::parquet::Encoding::RLE_DICTIONARY) + { + DCHECK(current_decoder_->encoding() == ::parquet::Encoding::RLE_DICTIONARY); + } + current_decoder_ = it->second.get(); + } + else + { + switch (encoding) + { + case ::parquet::Encoding::PLAIN: + { + std::shared_ptr decoder( + new internal::PlainDecoder(descr_)); + decoders_[static_cast(encoding)] = decoder; + current_decoder_ = decoder.get(); + break; + } + case ::parquet::Encoding::RLE_DICTIONARY: + throw ::parquet::ParquetException( + "Dictionary page must be before data page."); + + case ::parquet::Encoding::DELTA_BINARY_PACKED: + case ::parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: + case ::parquet::Encoding::DELTA_BYTE_ARRAY: + ::parquet::ParquetException::NYI("Unsupported encoding"); + + default: + throw ::parquet::ParquetException( + "Unknown encoding type."); + } + } + current_decoder_->SetData(static_cast(num_buffered_values_), + buffer, + static_cast(data_size)); + return true; + } + else + { + continue; + } + } + return true; +} + +static inline bool +_HasSpacedValues(const ::parquet::ColumnDescriptor *descr) +{ + if (descr->max_repetition_level() > 0) + { + return !descr->schema_node()->is_required(); + } + else + { + const ::parquet::schema::Node *node = descr->schema_node().get(); + while (node) + { + if (node->is_optional()) + { + return true; + } + node = node->parent(); + } + return false; + } +} + + +struct is_equal +{ + int16_t max_definition_level; + + is_equal(int16_t max_definition_level) + : max_definition_level(max_definition_level) + { + + } + __host__ __device__ bool operator()(const int16_t &x) + { + return x == max_definition_level; + } +}; + +// expands data vector that does not contain nulls into a representation that has indeterminate values where there should be nulls +// A vector of int work_space needs to be allocated to hold the map for the scatter operation. 
The workspace should be of size batch_size +template +void compact_to_sparse_for_nulls(T *data_in, T *data_out, const int16_t *definition_levels, int16_t max_definition_level, + int64_t batch_size, int *work_space) +{ + is_equal op(max_definition_level); + auto out_iter = thrust::copy_if(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(batch_size), + definition_levels, + work_space, + op); + int num_not_null = out_iter - work_space; + thrust::scatter(thrust::device, data_in, data_in + num_not_null, work_space, data_out); +} + + +#define WARP_BYTE 4 +#define WARP_SIZE 32 +#define WARP_MASK 0xFFFFFFFF +constexpr unsigned int THREAD_BLOCK_SIZE{256}; + +template +__global__ void transform_valid_kernel(uint8_t* valid, const int64_t size, Functor is_valid) { + size_t tid = threadIdx.x; + size_t blkid = blockIdx.x; + size_t blksz = blockDim.x; + size_t gridsz = gridDim.x; + + size_t step = blksz * gridsz; + size_t i = tid + blkid * blksz; + + while (i < size) { + uint32_t bitmask = 0; + uint32_t result = is_valid(i); + bitmask = (-result << (i % WARP_SIZE)); + + #pragma unroll + for (size_t offset = 16; offset > 0; offset /= 2) { + bitmask += __shfl_down_sync(WARP_MASK, bitmask, offset); + } + + if ((i % WARP_SIZE) == 0) { + int index = i / WARP_SIZE * WARP_BYTE; + valid[index + 0] = 0xFF & bitmask; + valid[index + 1] = 0xFF & (bitmask >> 8); + valid[index + 2] = 0xFF & (bitmask >> 16); + valid[index + 3] = 0xFF & (bitmask >> 24); + } + i += step; + } +} + +template +__global__ void transform_valid_kernel(uint8_t* valid, const int64_t size, size_t num_chars, Functor is_valid) { + size_t tid = threadIdx.x; + size_t blkid = blockIdx.x; + size_t blksz = blockDim.x; + size_t gridsz = gridDim.x; + + size_t step = blksz * gridsz; + size_t i = tid + blkid * blksz; + + while (i < size) { + uint32_t bitmask = 0; + uint32_t result = is_valid(i); + bitmask = (-result << (i % WARP_SIZE)); + + #pragma unroll + for (size_t offset = 16; offset > 0; offset /= 2) { + bitmask += __shfl_down_sync(WARP_MASK, bitmask, offset); + } + + if ((i % WARP_SIZE) == 0) { + int index = i / WARP_SIZE * WARP_BYTE; + if (index + 0 < num_chars) + valid[index + 0] = 0xFF & bitmask; + if (index + 1 < num_chars) + valid[index + 1] = 0xFF & (bitmask >> 8); + if (index + 2 < num_chars) + valid[index + 2] = 0xFF & (bitmask >> 16); + if (index + 3 < num_chars) + valid[index + 3] = 0xFF & (bitmask >> 24); + } + i += step; + } +} + +template +void transform_valid(uint8_t* valid, const int64_t size, Functor is_valid) { + const dim3 grid ((size + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE, 1, 1); + const dim3 block (THREAD_BLOCK_SIZE, 1, 1); + if (size % 32 == 0) { + transform_valid_kernel <<>>(valid, size, is_valid); + } + else { + size_t num_chars = gdf_get_num_chars_bitmask(size); + transform_valid_kernel <<>>(valid, size, num_chars, is_valid); + } +} + +struct TurnOnFunctor { + __host__ __device__ uint32_t operator() (size_t index) { + return 0xFFFFFFFF; + } +}; + +static inline void _TurnBitOnForValids(std::int64_t def_length, + std::uint8_t * d_valid_ptr, + const std::int64_t valid_bits_offset) +{ + if (valid_bits_offset % 8 == 0) { + transform_valid(d_valid_ptr + valid_bits_offset / 8, def_length, TurnOnFunctor{}); + } else { + size_t left_bits_length = valid_bits_offset % 8; + size_t rigth_bits_length = 8 - left_bits_length; + uint8_t mask; + cudaMemcpy(&mask, d_valid_ptr + (valid_bits_offset/8), 1, cudaMemcpyDeviceToHost); + + for(size_t i = 0; i < rigth_bits_length; i++) { + mask |= 
gdf::util::byte_bitmask(i + left_bits_length); + } + cudaMemcpy(d_valid_ptr + valid_bits_offset / 8, &mask, sizeof(uint8_t), cudaMemcpyHostToDevice); + transform_valid((d_valid_ptr + valid_bits_offset / 8 + 1), def_length, TurnOnFunctor{}); + } +} + +struct IsValidFunctor { + const std::int16_t *d_def_levels; + std::int16_t max_definition_level; + IsValidFunctor (const std::int16_t *d_def_levels, std::int16_t max_definition_level) : d_def_levels {d_def_levels}, max_definition_level{max_definition_level} + { + } + __host__ __device__ uint32_t operator() (size_t index) { + return d_def_levels[index] == max_definition_level ? 0xFFFFFFFF : 0x00000000; + } +}; + +static inline void +_DefinitionLevelsToBitmap(const std::int16_t *d_def_levels, + std::int64_t def_length, + const std::int16_t max_definition_level, + std::int64_t * values_read, + std::int64_t * null_count, + std::uint8_t * d_valid_ptr, + const std::int64_t valid_bits_offset) { + + if (valid_bits_offset % 8 == 0) { + transform_valid( + (d_valid_ptr + valid_bits_offset / 8), + def_length, + IsValidFunctor{ d_def_levels, max_definition_level }); + } else { + int left_bits_length = valid_bits_offset % 8; + int right_bits_length = 8 - left_bits_length; + uint8_t mask; + cudaMemcpy(&mask, d_valid_ptr + (valid_bits_offset/8), 1, cudaMemcpyDeviceToHost); + + thrust::host_vector h_def_levels(right_bits_length); + cudaMemcpy(h_def_levels.data(), d_def_levels, right_bits_length * sizeof(int16_t), cudaMemcpyDeviceToHost); + for(size_t i = 0; i < h_def_levels.size(); i++) { + if (h_def_levels[i] == max_definition_level) { + mask |= gdf::util::byte_bitmask(i + left_bits_length); + } else { + if (h_def_levels[i] < max_definition_level) { + mask &= gdf::util::flipped_bitmask(i + left_bits_length); + } + } + } + cudaMemcpy(d_valid_ptr + valid_bits_offset / 8, &mask, sizeof(uint8_t), cudaMemcpyHostToDevice); + transform_valid (d_valid_ptr + valid_bits_offset/8 + 1, + def_length - right_bits_length, + IsValidFunctor{d_def_levels + right_bits_length, max_definition_level}); + } + int not_null_count = thrust::count(thrust::device_pointer_cast(d_def_levels), thrust::device_pointer_cast(d_def_levels) + def_length, max_definition_level); + *null_count = def_length - not_null_count; + *values_read = not_null_count; +} + +template +static inline std::int64_t +_ReadValuesSpaced(DecoderType *decoder, + std::int64_t batch_size, + T *out, + std::int64_t null_count, + std::uint8_t *valid_bits, + std::int64_t valid_bits_offset) +{ + return decoder->DecodeSpaced(out, + static_cast(batch_size), + static_cast(null_count), + valid_bits, + valid_bits_offset); +} + +template +inline std::int64_t +ColumnReader::ReadBatchSpaced(std::int64_t batch_size, + std::int16_t *definition_levels, + std::int16_t *repetition_levels, + T *values, + std::uint8_t *valid_bits, + std::int64_t valid_bits_offset, // + std::int64_t *levels_read, + std::int64_t *values_read, + std::int64_t *nulls_count) +{ + if (!HasNext()) + { + *levels_read = 0; + *values_read = 0; + *nulls_count = 0; + return 0; + } + + std::int64_t total_values; + + batch_size = std::min(batch_size, num_buffered_values_ - num_decoded_values_); + + if (descr_->max_definition_level() > 0) + { + std::int64_t num_def_levels = ReadDefinitionLevels(batch_size, definition_levels); + + const bool has_spaced_values = _HasSpacedValues(descr_); + + std::int64_t null_count = 0; + if (!has_spaced_values) + { + int result = thrust::count(thrust::device_pointer_cast(definition_levels), thrust::device_pointer_cast(definition_levels) + 
num_def_levels, descr_->max_definition_level()); + int values_to_read = result; + + total_values = _ReadValues(current_decoder_, values_to_read, values); + _TurnBitOnForValids(total_values, valid_bits, valid_bits_offset); + *values_read = total_values; + } + else + { + std::int16_t max_definition_level = descr_->max_definition_level(); + std::int16_t max_repetition_level = descr_->max_repetition_level(); + + _DefinitionLevelsToBitmap( + definition_levels, + num_def_levels, + max_definition_level, + values_read, + &null_count, + valid_bits, + valid_bits_offset); + + total_values = _ReadValues(current_decoder_, *values_read, values); + total_values = num_def_levels; + + if (total_values != *values_read) { + thrust::device_vector work_space_vector(total_values); + int* work_space = thrust::raw_pointer_cast(work_space_vector.data()); + thrust::device_vector d_values_in(values, values + total_values); + compact_to_sparse_for_nulls(thrust::raw_pointer_cast(d_values_in.data()), + values, + definition_levels, + max_definition_level, + total_values, + work_space); + } + } + *levels_read = num_def_levels; + *nulls_count = null_count; + } + else { + total_values = _ReadValues(current_decoder_, batch_size, values); + _TurnBitOnForValids(total_values, valid_bits, valid_bits_offset); + *nulls_count = 0; + *levels_read = total_values; + } + + ConsumeBufferedValues(*levels_read); + + return total_values; +} + +template +inline std::int64_t +ColumnReader::ReadBatch(std::int64_t batch_size, + std::int16_t *def_levels, + std::int16_t *rep_levels, + T *values, + std::int64_t *values_read) +{ + // assert(rep_levels == nullptr); + if (!HasNext()) + { + *values_read = 0; + return 0; + } + batch_size = std::min(batch_size, num_buffered_values_ - num_decoded_values_); + + std::int64_t num_def_levels = 0; + + std::int64_t values_to_read = 0; + + if (descr_->max_definition_level() > 0 && def_levels) + { + num_def_levels = ReadDefinitionLevels(batch_size, def_levels); + int result = thrust::count(thrust::device_pointer_cast(def_levels), thrust::device_pointer_cast(def_levels) + num_def_levels, descr_->max_definition_level()); + values_to_read = result; + } + else + { + values_to_read = batch_size; + } + + *values_read = _ReadValues(current_decoder_, values_to_read, values); + std::int64_t total_values = std::max(num_def_levels, *values_read); + ConsumeBufferedValues(total_values); + + return total_values; +} + +template +struct ParquetTraits +{ +}; + +#define TYPE_TRAITS_FACTORY(ParquetType, GdfDType) \ + template <> \ + struct ParquetTraits \ + { \ + static constexpr gdf_dtype gdfDType = GdfDType; \ + } + +TYPE_TRAITS_FACTORY(::parquet::BooleanType, GDF_INT8); +TYPE_TRAITS_FACTORY(::parquet::Int32Type, GDF_INT32); +TYPE_TRAITS_FACTORY(::parquet::Int64Type, GDF_INT64); +TYPE_TRAITS_FACTORY(::parquet::FloatType, GDF_FLOAT32); +TYPE_TRAITS_FACTORY(::parquet::DoubleType, GDF_FLOAT64); + +#undef TYPE_TRAITS_FACTORY + + +template +std::size_t ColumnReader::ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, + std::uint8_t & first_valid_byte, std::uint8_t & last_valid_byte) { + + if (!HasNext()) { + return 0; + } + std::int64_t values_to_read = num_buffered_values_ - num_decoded_values_; + + thrust::device_vector d_def_levels(values_to_read);//this size is work group size + std::int16_t *d_definition_levels = thrust::raw_pointer_cast(d_def_levels.data()); + + std::size_t rows_read_total = ToGdfColumn(column, offset, d_definition_levels); + + std::int16_t max_definition_level = 
descr_->max_definition_level(); + + if (offset > 0 && offset % 8 != 0){ // need to figure out the first_valid_byte + first_valid_byte = 0; + + int left_bits_length = offset % 8; + int right_bits_length = 8 - left_bits_length; + + thrust::host_vector h_def_levels(right_bits_length); + cudaMemcpy(h_def_levels.data(), d_definition_levels, right_bits_length * sizeof(int16_t), cudaMemcpyDeviceToHost); + for(size_t i = 0; i < h_def_levels.size(); i++) { + if (h_def_levels[i] == max_definition_level) { + first_valid_byte |= gdf::util::byte_bitmask(i + left_bits_length); + } else { + if (h_def_levels[i] < max_definition_level) { + first_valid_byte &= gdf::util::flipped_bitmask(i + left_bits_length); + } + } + } + } + if ( (offset + values_to_read) % 8 != 0 ) { // need to figure out the last_valid_byte + last_valid_byte = 0; + + int left_bits_length = (offset + values_to_read) % 8; + int right_bits_length = 8 - left_bits_length; + + thrust::host_vector h_def_levels(left_bits_length); + cudaMemcpy(h_def_levels.data(), d_definition_levels + values_to_read - left_bits_length, left_bits_length * sizeof(int16_t), cudaMemcpyDeviceToHost); + for(size_t i = 0; i < h_def_levels.size(); i++) { + if (h_def_levels[i] == max_definition_level) { + last_valid_byte |= gdf::util::byte_bitmask(i); + } else { + if (h_def_levels[i] < max_definition_level) { + last_valid_byte &= gdf::util::flipped_bitmask(i); + } + } + } + } + + return rows_read_total; + +} + +template +std::size_t ColumnReader::ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset) { + if (!HasNext()) { + return 0; + } + std::int64_t values_to_read = num_buffered_values_ - num_decoded_values_; + + thrust::device_vector d_def_levels(values_to_read);//this size is work group size + std::int16_t *d_definition_levels = thrust::raw_pointer_cast(d_def_levels.data()); + + return ToGdfColumn(column, offset, d_definition_levels); +} + +template +std::size_t ColumnReader::ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, std::int16_t *d_definition_levels) { + if (!HasNext()) { + return 0; + } + using c_type = typename DataType::c_type; + + c_type *const values = static_cast(column.data) + offset; + std::uint8_t *const d_valid_bits = + static_cast(column.valid) + (offset / 8); + + static std::int64_t levels_read = 0; + static std::int64_t values_read = 0; + static std::int64_t nulls_count = 0; + + int64_t rows_read_total = 0; + std::int64_t values_to_read = num_buffered_values_ - num_decoded_values_; + + do { + values_to_read = num_buffered_values_ - num_decoded_values_; + int64_t rows_read = ReadBatchSpaced( + values_to_read, + d_definition_levels + rows_read_total, + nullptr, + static_cast(values + rows_read_total), + d_valid_bits, + rows_read_total + (offset % 8), + &levels_read, + &values_read, + &nulls_count); + + rows_read_total += rows_read; + } while (this->HasNext()); + return static_cast(rows_read_total); +} + +template class ColumnReader<::parquet::BooleanType>; +template class ColumnReader<::parquet::Int32Type>; +template class ColumnReader<::parquet::Int64Type>; +template class ColumnReader<::parquet::FloatType>; +template class ColumnReader<::parquet::DoubleType>; + +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h new file mode 100644 index 00000000..5534acb4 --- /dev/null +++ b/src/parquet/column_reader.h @@ -0,0 +1,92 @@ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GDF_PARQUET_COLUMN_READER_H +#define _GDF_PARQUET_COLUMN_READER_H + +#include +#include +#include "decoder/cu_level_decoder.h" + +namespace gdf { +namespace parquet { + +template +class ColumnReader : public ::parquet::ColumnReader { +public: + using T = typename DataType::c_type; + + ColumnReader(const ::parquet::ColumnDescriptor* schema, std::unique_ptr<::parquet::PageReader> pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : ::parquet::ColumnReader(schema, std::move(pager), pool), current_decoder_(nullptr) {} + + + bool HasNext(); + + std::int64_t ReadBatch(std::int64_t batch_size, + std::int16_t *def_levels, + std::int16_t *rep_levels, + T * values, + std::int64_t *values_read); + + std::int64_t ReadBatchSpaced(std::int64_t batch_size, + std::int16_t *definition_levels, + std::int16_t *repetition_levels, + T * values, + std::uint8_t *valid_bits, + std::int64_t valid_bits_offset, + std::int64_t *levels_read, + std::int64_t *values_read, + std::int64_t *nulls_count); + + std::size_t ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset = 0); + + std::size_t ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, std::int16_t *d_definition_levels); + + std::size_t ToGdfColumn(const gdf_column & column, const std::ptrdiff_t offset, std::uint8_t & first_valid_byte, std::uint8_t & last_valid_byte); + + + int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) { + if (descr_->max_definition_level() == 0) { + return 0; + } + return def_level_decoder_.Decode(static_cast(batch_size), levels); + } + + + +private: + bool ReadNewPage() final; + + using DecoderType = ::parquet::Decoder; + + std::unordered_map> decoders_; + DecoderType * current_decoder_; + gdf::parquet::decoder::CUDALevelDecoder def_level_decoder_; +}; + +using BoolReader = ColumnReader<::parquet::BooleanType>; +using Int32Reader = ColumnReader<::parquet::Int32Type>; +using Int64Reader = ColumnReader<::parquet::Int64Type>; +using FloatReader = ColumnReader<::parquet::FloatType>; +using DoubleReader = ColumnReader<::parquet::DoubleType>; + +} // namespace parquet +} // namespace gdf + +#endif diff --git a/src/parquet/decoder/cu_level_decoder.cu b/src/parquet/decoder/cu_level_decoder.cu new file mode 100644 index 00000000..628716b6 --- /dev/null +++ b/src/parquet/decoder/cu_level_decoder.cu @@ -0,0 +1,95 @@ +// +// Created by aocsa on 8/25/18. 
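+// CUDALevelDecoder decodes Parquet definition/repetition level runs into
+// device memory: RLE-encoded levels go through gdf::arrow::internal::RleDecoder,
+// while BIT_PACKED levels are unpacked on the GPU via unpack_using_gpu.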
+// + +#include "arrow/util/rle-encoding.h" +#include +#include "../../arrow/rle_decoder.h" +#include "../../arrow/bit-stream.h" + +#include "cu_level_decoder.h" + +namespace gdf { +namespace parquet { +namespace decoder { + +CUDALevelDecoder::CUDALevelDecoder() + : num_values_remaining_(0), rle_decoder_(nullptr), bit_packed_decoder_(nullptr) +{ +} + +CUDALevelDecoder::~CUDALevelDecoder() {} + +int CUDALevelDecoder::SetData(::parquet::Encoding::type encoding, + int16_t max_level, int num_buffered_values, + const uint8_t* data) +{ + int32_t num_bytes = 0; + encoding_ = encoding; + num_values_remaining_ = num_buffered_values; + bit_width_ = ::arrow::BitUtil::Log2(max_level + 1); + switch (encoding) { + case ::parquet::Encoding::RLE: { + num_bytes = *reinterpret_cast(data); + const uint8_t* decoder_data = data + sizeof(int32_t); + if (rle_decoder_ == nullptr) { + rle_decoder_.reset( + new gdf::arrow::internal::RleDecoder(decoder_data, num_bytes, bit_width_)); + } else { + rle_decoder_->Reset(decoder_data, num_bytes, bit_width_); + } + return sizeof(int32_t) + num_bytes; + } + case ::parquet::Encoding::BIT_PACKED: { + num_bytes = static_cast( + ::arrow::BitUtil::Ceil(num_buffered_values * bit_width_, 8)); + if (!bit_packed_decoder_) { + bit_packed_decoder_.reset(new gdf::arrow::internal::BitReader(data, num_bytes)); + } else { + bit_packed_decoder_->Reset(data, num_bytes); + } + return num_bytes; + } + default: + throw ::parquet::ParquetException("Unknown encoding type for levels."); + } +} + +int CUDALevelDecoder::Decode(int batch_size, int16_t* d_levels) +{ + int num_decoded = 0; + int num_values = std::min(num_values_remaining_, batch_size); + if (encoding_ == ::parquet::Encoding::RLE) { + num_decoded = rle_decoder_->GetBatch(d_levels, num_values); + } else { + // num_decoded = bit_packed_decoder_->GetBatch(bit_width_, d_levels, num_values); + int literal_batch = num_values; + int values_read = 0; + std::vector rleRuns; + std::vector rleValues; + int numRle; + int numBitpacked; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + bit_packed_decoder_->SetGpuBatchMetadata( + 1, d_levels, literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + + num_decoded = gdf::arrow::internal::unpack_using_gpu ( + bit_packed_decoder_->get_buffer(), bit_packed_decoder_->get_buffer_len(), + unpack32InputOffsets, + unpack32InputRunLengths, + unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, bit_width_, d_levels, literal_batch); + } + num_values_remaining_ -= num_decoded; + return num_decoded; +} + +} // namespace decoder +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/decoder/cu_level_decoder.h b/src/parquet/decoder/cu_level_decoder.h new file mode 100644 index 00000000..2317c54c --- /dev/null +++ b/src/parquet/decoder/cu_level_decoder.h @@ -0,0 +1,43 @@ +// +// Created by aocsa on 8/25/18. 
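+//
+// Illustrative usage sketch (not part of the build); it assumes `page_data`
+// points to the encoded level run and `d_levels` is a device buffer with room
+// for `batch_size` int16_t values:
+//
+//   gdf::parquet::decoder::CUDALevelDecoder level_decoder;
+//   int bytes_consumed = level_decoder.SetData(::parquet::Encoding::RLE,
+//                                              max_definition_level,
+//                                              num_buffered_values,
+//                                              page_data);
+//   int levels_decoded = level_decoder.Decode(batch_size, d_levels);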
+// + +#ifndef _GDF_PARQUET_CUDALEVELDECODER_H +#define _GDF_PARQUET_CUDALEVELDECODER_H + +#include "parquet/types.h" +#include +#include "../../arrow/rle_decoder.h" +#include "../../arrow/bit-stream.h" + +namespace gdf { +namespace parquet { +namespace decoder { + +class CUDALevelDecoder { +public: + CUDALevelDecoder(); + + ~CUDALevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(::parquet::Encoding::type encoding, int16_t max_level, + int num_buffered_values, const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels + // decoded + int Decode(int batch_size, int16_t* levels); + +private: + int bit_width_; + int num_values_remaining_; + ::parquet::Encoding::type encoding_; + std::unique_ptr< gdf::arrow::internal::RleDecoder> rle_decoder_; + std::unique_ptr< gdf::arrow::internal::BitReader> bit_packed_decoder_; +}; +} // namespace decoder +} // namespace parquet +} // namespace gdf + +#endif //_GDF_PARQUET_CUDALEVELDECODER_H diff --git a/src/parquet/dictionary_decoder.cuh b/src/parquet/dictionary_decoder.cuh new file mode 100644 index 00000000..c5dccb0c --- /dev/null +++ b/src/parquet/dictionary_decoder.cuh @@ -0,0 +1,117 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../arrow/rle_decoder.h" +#include "arrow/util/rle-encoding.h" +#include + +namespace parquet { +class ColumnDescriptor; +} + +namespace gdf { +namespace parquet { +namespace internal { + +template +class DictionaryDecoder : public ::parquet::Decoder { +public: + typedef typename Type::c_type T; + + explicit DictionaryDecoder( + const ::parquet::ColumnDescriptor *descr, + ::arrow::MemoryPool * pool = nullptr) + : ::parquet::Decoder(descr, ::parquet::Encoding::RLE_DICTIONARY), + dictionary_(0) + { + } + + void SetDict(::parquet::Decoder *dictionary); + + void + SetData(int num_values, const std::uint8_t *data, int len) override { + num_values_ = num_values; + if (len == 0) return; + std::uint8_t bit_width = *data; + ++data; + --len; + idx_decoder_ = RleDecoder(data, len, bit_width); + } + + int + Decode(T *buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int decoded_values = idx_decoder_.GetBatchWithDict( + thrust::raw_pointer_cast(dictionary_.data()), num_dictionary_values_, buffer, max_values); + if (decoded_values != max_values) { + ::parquet::ParquetException::EofException(); + } + num_values_ -= max_values; + return max_values; + } + + int + DecodeSpaced(T * buffer, + int num_values, + int null_count, + const std::uint8_t *valid_bits, + std::int64_t valid_bits_offset) override { + int decoded_values = + idx_decoder_.GetBatchWithDictSpaced( thrust::raw_pointer_cast(dictionary_.data()), + num_dictionary_values_, + buffer, + num_values, + null_count, + valid_bits, + valid_bits_offset); + if (decoded_values != num_values) { + ::parquet::ParquetException::EofException(); + } + return decoded_values; + } + +private: + using ::parquet::Decoder::num_values_; + + thrust::device_vector dictionary_; + + RleDecoder idx_decoder_; + + int num_dictionary_values_; +}; + +template +inline void +DictionaryDecoder::SetDict( + ::parquet::Decoder *dictionary) { + int num_dictionary_values = dictionary->values_left(); + num_dictionary_values_ = num_dictionary_values; + dictionary_.resize(num_dictionary_values); + dictionary->Decode(thrust::raw_pointer_cast(dictionary_.data()), num_dictionary_values); +} + +template <> +inline void +DictionaryDecoder<::parquet::BooleanType, ::arrow::RleDecoder>::SetDict( + ::parquet::Decoder<::parquet::BooleanType> *) { + ::parquet::ParquetException::NYI( + "Dictionary encoding is not implemented for boolean values"); +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/file_reader.cpp b/src/parquet/file_reader.cpp new file mode 100644 index 00000000..4e23e9a4 --- /dev/null +++ b/src/parquet/file_reader.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "column_reader.h" +#include "file_reader.h" +#include "file_reader_contents.h" + +namespace gdf { +namespace parquet { + +// ---------------------------------------------------------------------- +// RowGroupReader public API + +GdfRowGroupReader::GdfRowGroupReader(std::unique_ptr<::parquet::RowGroupReader::Contents> contents) + : ::parquet::RowGroupReader(nullptr), contents_(std::move(contents)) {} + + +static std::shared_ptr<::parquet::ColumnReader> GdfColumnReaderMake(const ::parquet::ColumnDescriptor* descr, + std::unique_ptr<::parquet::PageReader> pager, + ::arrow::MemoryPool* pool) { + switch (descr->physical_type()) { + case ::parquet::Type::BOOLEAN: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + case ::parquet::Type::INT32: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + break; + case ::parquet::Type::INT64: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + case ::parquet::Type::FLOAT: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + case ::parquet::Type::DOUBLE: + return std::static_pointer_cast<::parquet::ColumnReader>(std::make_shared(descr, std::move(pager), pool)); + default: + ::parquet::ParquetException::NYI("type reader not implemented"); + } + // Unreachable code, but supress compiler warning + return std::shared_ptr<::parquet::ColumnReader>(nullptr); +} + + +std::shared_ptr<::parquet::ColumnReader> GdfRowGroupReader::Column(int i) { + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; + const ::parquet::ColumnDescriptor* descr = metadata()->schema()->Column(i); + + std::unique_ptr<::parquet::PageReader> page_reader = contents_->GetColumnPageReader(i); + return GdfColumnReaderMake( + descr, std::move(page_reader), + const_cast<::parquet::ReaderProperties*>(contents_->properties())->memory_pool()); +} + + + +std::unique_ptr<::parquet::PageReader> GdfRowGroupReader::GetColumnPageReader(int i) { + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; + return contents_->GetColumnPageReader(i); +} + +// Returns the rowgroup metadata +const ::parquet::RowGroupMetaData* GdfRowGroupReader::metadata() const { return contents_->metadata(); } + +// ---------------------------------------------------------------------- + +std::unique_ptr +FileReader::OpenFile(const std::string & path, + const ::parquet::ReaderProperties &properties) { + + FileReader *const reader = new FileReader(); + reader->parquetFileReader_.reset(new ::parquet::ParquetFileReader()); + + std::shared_ptr<::arrow::io::ReadableFile> file; + + PARQUET_THROW_NOT_OK( + ::arrow::io::ReadableFile::Open(path, properties.memory_pool(), &file)); + + std::unique_ptr<::parquet::RandomAccessSource> source( + new ::parquet::ArrowInputFile(file)); + + std::unique_ptr<::parquet::ParquetFileReader::Contents> contents( + new internal::FileReaderContents(std::move(source), properties)); + + static_cast(contents.get()) + ->ParseMetaData(); + + reader->parquetFileReader_->Open(std::move(contents)); + + return std::unique_ptr(reader); +} + +std::unique_ptr +FileReader::OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, + const ::parquet::ReaderProperties &properties) { + + 
FileReader *const reader = new FileReader(); + reader->parquetFileReader_.reset(new ::parquet::ParquetFileReader()); + + std::unique_ptr<::parquet::RandomAccessSource> source( + new ::parquet::ArrowInputFile(file)); + + std::unique_ptr<::parquet::ParquetFileReader::Contents> contents( + new internal::FileReaderContents(std::move(source), properties)); + + static_cast(contents.get()) + ->ParseMetaData(); + + reader->parquetFileReader_->Open(std::move(contents)); + + + return std::unique_ptr(reader); +} + +std::shared_ptr +FileReader::RowGroup(int i) { + return std::static_pointer_cast< GdfRowGroupReader >(parquetFileReader_->RowGroup(i)); +} + +std::shared_ptr<::parquet::FileMetaData> +FileReader::metadata() const { + return parquetFileReader_->metadata(); +} + +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/file_reader.h b/src/parquet/file_reader.h new file mode 100644 index 00000000..72643991 --- /dev/null +++ b/src/parquet/file_reader.h @@ -0,0 +1,70 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GDF_PARQUET_FILE_READER_H +#define _GDF_PARQUET_FILE_READER_H + +#include +#include + +namespace gdf { +namespace parquet { + + +class GdfRowGroupReader : public ::parquet::RowGroupReader { +public: + + explicit GdfRowGroupReader(std::unique_ptr contents); + + // Returns the rowgroup metadata + const ::parquet::RowGroupMetaData* metadata() const; + + // Construct a ColumnReader for the indicated row group-relative + // column. Ownership is shared with the RowGroupReader. + std::shared_ptr<::parquet::ColumnReader> Column(int i); + + std::unique_ptr<::parquet::PageReader> GetColumnPageReader(int i); + +private: + // Holds a pointer to an instance of Contents implementation + std::unique_ptr contents_; +}; + +class FileReader { +public: + static std::unique_ptr + OpenFile(const std::string & path, + const ::parquet::ReaderProperties &properties = + ::parquet::default_reader_properties()); + + static std::unique_ptr + OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, + const ::parquet::ReaderProperties &properties = + ::parquet::default_reader_properties()); + + std::shared_ptr RowGroup(int i); + std::shared_ptr<::parquet::FileMetaData> metadata() const; + +private: + std::unique_ptr<::parquet::ParquetFileReader> parquetFileReader_; +}; + +} // namespace parquet +} // namespace gdf + +#endif diff --git a/src/parquet/file_reader_contents.cpp b/src/parquet/file_reader_contents.cpp new file mode 100644 index 00000000..d633a74d --- /dev/null +++ b/src/parquet/file_reader_contents.cpp @@ -0,0 +1,112 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "file_reader_contents.h" + +#include "row_group_reader_contents.h" + +#include "file_reader.h" + +namespace gdf { +namespace parquet { +namespace internal { + +FileReaderContents::FileReaderContents( + std::unique_ptr<::parquet::RandomAccessSource> source, + const ::parquet::ReaderProperties & properties) + : source_(std::move(source)), properties_(properties) {} + +FileReaderContents::~FileReaderContents() { + try { + Close(); + } catch (...) {} +} + +void +FileReaderContents::Close() { + source_->Close(); +} + +std::shared_ptr<::parquet::RowGroupReader> +FileReaderContents::GetRowGroup(int i) { + std::unique_ptr contents( + new internal::RowGroupReaderContents( + source_.get(), file_metadata_.get(), i, properties_)); + return std::static_pointer_cast<::parquet::RowGroupReader>(std::make_shared(std::move(contents))); +} + +std::shared_ptr<::parquet::FileMetaData> +FileReaderContents::metadata() const { + return file_metadata_; +} + +void +FileReaderContents::ParseMetaData() { + std::int64_t file_size = source_->Size(); + + if (file_size < FOOTER_SIZE) { + throw ::parquet::ParquetException( + "Corrupted file, smaller than file footer"); + } + + std::uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE]; + std::int64_t footer_read_size = + std::min(file_size, DEFAULT_FOOTER_READ_SIZE); + std::int64_t bytes_read = source_->ReadAt( + file_size - footer_read_size, footer_read_size, footer_buffer); + + if (bytes_read != footer_read_size + || std::memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) + != 0) { + throw ::parquet::ParquetException( + "Invalid parquet file. Corrupt footer."); + } + + std::uint32_t metadata_len = *reinterpret_cast( + footer_buffer + footer_read_size - FOOTER_SIZE); + std::int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; + if (FOOTER_SIZE + metadata_len > file_size) { + throw ::parquet::ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + + std::shared_ptr<::parquet::PoolBuffer> metadata_buffer = + ::parquet::AllocateBuffer(properties_.memory_pool(), metadata_len); + + if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { + std::memcpy(metadata_buffer->mutable_data(), + footer_buffer + + (footer_read_size - metadata_len - FOOTER_SIZE), + metadata_len); + } else { + bytes_read = source_->ReadAt( + metadata_start, metadata_len, metadata_buffer->mutable_data()); + if (bytes_read != metadata_len) { + throw ::parquet::ParquetException( + "Invalid parquet file. Could not read metadata bytes."); + } + } + + file_metadata_ = + ::parquet::FileMetaData::Make(metadata_buffer->data(), &metadata_len); +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/file_reader_contents.h b/src/parquet/file_reader_contents.h new file mode 100644 index 00000000..c75ac0b1 --- /dev/null +++ b/src/parquet/file_reader_contents.h @@ -0,0 +1,49 @@ +/* + * Copyright 2018 BlazingDB, Inc. 
+ * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace gdf { +namespace parquet { +namespace internal { + +class FileReaderContents : public ::parquet::ParquetFileReader::Contents { +public: + FileReaderContents(std::unique_ptr<::parquet::RandomAccessSource> source, + const ::parquet::ReaderProperties &properties = + ::parquet::default_reader_properties()); + + ~FileReaderContents() final; + void Close() final; + std::shared_ptr<::parquet::RowGroupReader> GetRowGroup(int i) final; + std::shared_ptr<::parquet::FileMetaData> metadata() const final; + + void ParseMetaData(); + +private: + std::unique_ptr<::parquet::RandomAccessSource> source_; + std::shared_ptr<::parquet::FileMetaData> file_metadata_; + ::parquet::ReaderProperties properties_; + + const int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024; + const uint32_t FOOTER_SIZE = 8; + const uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; +}; + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/page_reader.cpp b/src/parquet/page_reader.cpp new file mode 100644 index 00000000..5dcfcf46 --- /dev/null +++ b/src/parquet/page_reader.cpp @@ -0,0 +1,201 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "page_reader.h" + +#include +#include + +namespace gdf { +namespace parquet { +namespace internal { + +namespace { +template +inline void +DeserializeThriftMsg(const std::uint8_t *buf, + std::uint32_t * len, + T * deserialized_msg) { + std::shared_ptr tmem_transport( + new apache::thrift::transport::TMemoryBuffer( + const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + std::shared_ptr tproto = + tproto_factory.getProtocol(tmem_transport); + + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception &e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ::parquet::ParquetException(ss.str()); + } + + std::uint32_t bytes_left = tmem_transport->available_read(); + + *len = *len - bytes_left; +} + +static inline ::parquet::Encoding::type +FromThrift(::parquet::format::Encoding::type type) { + return static_cast<::parquet::Encoding::type>(type); +} +} // namespace + +PageReader::PageReader(std::unique_ptr<::parquet::InputStream> stream, + std::int64_t total_num_rows, + ::parquet::Compression::type codec, + arrow::MemoryPool * pool) + : stream_(std::move(stream)), + decompression_buffer_(::parquet::AllocateBuffer(pool, 0)), + seen_num_rows_(0), total_num_rows_(total_num_rows) { + max_page_header_size_ = kDefaultMaxPageHeaderSize; + decompressor_ = GetCodecFromArrow(codec); +} + +std::shared_ptr<::parquet::Page> +PageReader::NextPage() { + while (seen_num_rows_ < total_num_rows_) { + std::int64_t bytes_read = 0; + std::int64_t bytes_available = 0; + std::uint32_t header_size = 0; + const std::uint8_t *buffer; + std::uint32_t allowed_page_size = kDefaultPageHeaderSize; + + for (;;) { + buffer = stream_->Peek(allowed_page_size, &bytes_available); + if (bytes_available == 0) { + return std::shared_ptr<::parquet::Page>(nullptr); + } + + header_size = static_cast(bytes_available); + try { + DeserializeThriftMsg( + buffer, &header_size, ¤t_page_header_); + break; + } catch (std::exception &e) { + std::stringstream ss; + ss << e.what(); + allowed_page_size *= 2; + if (allowed_page_size > max_page_header_size_) { + ss << "Deserializing page header failed.\n"; + throw ::parquet::ParquetException(ss.str()); + } + } + } + stream_->Advance(header_size); + + int compressed_len = current_page_header_.compressed_page_size; + int uncompressed_len = current_page_header_.uncompressed_page_size; + + buffer = stream_->Read(compressed_len, &bytes_read); + if (bytes_read != compressed_len) { + ::parquet::ParquetException::EofException(); + } + + if (decompressor_ != nullptr) { + if (uncompressed_len + > static_cast(decompression_buffer_->size())) { + PARQUET_THROW_NOT_OK( + decompression_buffer_->Resize(uncompressed_len, false)); + } + PARQUET_THROW_NOT_OK(decompressor_->Decompress( + compressed_len, + buffer, + uncompressed_len, + decompression_buffer_->mutable_data())); + buffer = decompression_buffer_->data(); + } + + auto page_buffer = + std::make_shared<::parquet::Buffer>(buffer, uncompressed_len); + + if (current_page_header_.type + == ::parquet::format::PageType::DICTIONARY_PAGE) { + const ::parquet::format::DictionaryPageHeader &dict_header = + current_page_header_.dictionary_page_header; + + bool is_sorted = + dict_header.__isset.is_sorted ? 
dict_header.is_sorted : false; + + return std::make_shared<::parquet::DictionaryPage>( + page_buffer, + dict_header.num_values, + FromThrift(dict_header.encoding), + is_sorted); + } else if (current_page_header_.type + == ::parquet::format::PageType::DATA_PAGE) { + const ::parquet::format::DataPageHeader &header = + current_page_header_.data_page_header; + + ::parquet::EncodedStatistics page_statistics; + if (header.__isset.statistics) { + const ::parquet::format::Statistics &stats = header.statistics; + if (stats.__isset.max) { page_statistics.set_max(stats.max); } + if (stats.__isset.min) { page_statistics.set_min(stats.min); } + if (stats.__isset.null_count) { + page_statistics.set_null_count(stats.null_count); + } + if (stats.__isset.distinct_count) { + page_statistics.set_distinct_count(stats.distinct_count); + } + } + + seen_num_rows_ += header.num_values; + + return std::make_shared<::parquet::DataPage>( + page_buffer, + header.num_values, + FromThrift(header.encoding), + FromThrift(header.definition_level_encoding), + FromThrift(header.repetition_level_encoding), + page_statistics); + } else if (current_page_header_.type + == ::parquet::format::PageType::DATA_PAGE_V2) { + const ::parquet::format::DataPageHeaderV2 &header = + current_page_header_.data_page_header_v2; + bool is_compressed = + header.__isset.is_compressed ? header.is_compressed : false; + + seen_num_rows_ += header.num_values; + + return std::make_shared<::parquet::DataPageV2>( + page_buffer, + header.num_values, + header.num_nulls, + header.num_rows, + FromThrift(header.encoding), + header.definition_levels_byte_length, + header.repetition_levels_byte_length, + is_compressed); + } else { + continue; + } + } + return std::shared_ptr<::parquet::Page>(nullptr); +} + +void +PageReader::set_max_page_header_size(std::uint32_t size) { + max_page_header_size_ = size; +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/page_reader.h b/src/parquet/page_reader.h new file mode 100644 index 00000000..49b3d561 --- /dev/null +++ b/src/parquet/page_reader.h @@ -0,0 +1,57 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace gdf { +namespace parquet { +namespace internal { + +class PageReader : public ::parquet::PageReader { +public: + PageReader(std::unique_ptr<::parquet::InputStream> stream, + std::int64_t total_num_rows, + ::parquet::Compression::type codec, + arrow::MemoryPool * pool); + + std::shared_ptr<::parquet::Page> NextPage() final; + + void set_max_page_header_size(std::uint32_t size) override; + +private: + static const std::uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + static const std::uint32_t kDefaultPageHeaderSize = 16 * 1024; + + std::unique_ptr<::parquet::InputStream> stream_; + + ::parquet::format::PageHeader current_page_header_; + std::shared_ptr<::parquet::Page> current_page_; + + std::unique_ptr decompressor_; + std::shared_ptr decompression_buffer_; + + std::uint32_t max_page_header_size_; + + std::int64_t seen_num_rows_; + + std::int64_t total_num_rows_; +}; + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/plain_decoder.cuh b/src/parquet/plain_decoder.cuh new file mode 100644 index 00000000..67e30c4d --- /dev/null +++ b/src/parquet/plain_decoder.cuh @@ -0,0 +1,155 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "../arrow/bit-stream.h" +#include + +namespace parquet { +class ColumnDescriptor; +} + +namespace gdf { +namespace parquet { +namespace internal { + +template +class PlainDecoder : public ::parquet::Decoder { +public: + typedef typename DataType::c_type T; + using ::parquet::Decoder::num_values_; + + explicit PlainDecoder(const ::parquet::ColumnDescriptor *descr) + : ::parquet::Decoder(descr, ::parquet::Encoding::PLAIN), + data_(nullptr), len_(0) { + if (descr_ + && descr_->physical_type() + == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { + type_length_ = descr_->type_length(); + } else { + type_length_ = -1; + } + } + + virtual void + SetData(int num_values, const std::uint8_t *data, int len) { + num_values_ = num_values; + data_ = data; + len_ = len; + } + + virtual int Decode(T *buffer, int max_values); + +private: + using ::parquet::Decoder::descr_; + const std::uint8_t *data_; + int len_; + int type_length_; +}; + +template +inline int +DecodePlain(const std::uint8_t *data, + std::int64_t data_size, + int num_values, + int, + T *out) { + int bytes_to_decode = num_values * static_cast(sizeof(T)); + if (data_size < bytes_to_decode) { + ::parquet::ParquetException::EofException(); + } + cudaMemcpy(out, data, bytes_to_decode, cudaMemcpyHostToDevice); + return bytes_to_decode; +} + +template +inline int +PlainDecoder::Decode(T *buffer, int max_values) { + max_values = std::min(max_values, num_values_); + int bytes_consumed = + DecodePlain(data_, len_, max_values, type_length_, buffer); + data_ += bytes_consumed; + len_ -= bytes_consumed; + num_values_ -= max_values; + return max_values; +} + +template <> +class PlainDecoder<::parquet::BooleanType> + : public ::parquet::Decoder<::parquet::BooleanType> { +public: + explicit PlainDecoder(const ::parquet::ColumnDescriptor *descr) + : ::parquet::Decoder<::parquet::BooleanType>( + descr, + ::parquet::Encoding::PLAIN) {} + + virtual void + SetData(int num_values, const std::uint8_t *data, int len) { + num_values_ = num_values; + bit_reader_ = gdf::arrow::internal::BitReader(data, len); + } + + int + Decode(std::uint8_t *buffer, int max_values) { + max_values = std::min(max_values, num_values_); + bool val; + for (int i = 0; i < max_values; ++i) { + if (!bit_reader_.GetValue(1, &val)) { + ::parquet::ParquetException::EofException(); + } + ::arrow::BitUtil::SetArrayBit(buffer, i, val); + } + num_values_ -= max_values; + return max_values; + } + + virtual int + Decode(bool *buffer, int max_values) { + max_values = std::min(max_values, num_values_); + + int literal_batch = max_values; + int values_read = 0; + std::vector rleRuns; + std::vector rleValues; + std::vector unpack32InputOffsets, unpack32InputRunLengths, unpack32OutputOffsets; + std::vector remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets; + + bit_reader_.SetGpuBatchMetadata( + 1, buffer, literal_batch, values_read, unpack32InputOffsets, unpack32InputRunLengths, + unpack32OutputOffsets, remainderInputOffsets, remainderBitOffsets, + remainderSetSize, remainderOutputOffsets); + + gdf::arrow::internal::unpack_using_gpu ( + bit_reader_.get_buffer(), bit_reader_.get_buffer_len(), + unpack32InputOffsets, + unpack32InputRunLengths, + unpack32OutputOffsets, + remainderInputOffsets, remainderBitOffsets, remainderSetSize, + remainderOutputOffsets, 1, buffer, literal_batch); + + num_values_ -= max_values; + return max_values; + } + +private: + gdf::arrow::internal::BitReader bit_reader_; +}; + +} // namespace internal +} // namespace parquet 
+} // namespace gdf diff --git a/src/parquet/row_group_reader_contents.cpp b/src/parquet/row_group_reader_contents.cpp new file mode 100644 index 00000000..252173b9 --- /dev/null +++ b/src/parquet/row_group_reader_contents.cpp @@ -0,0 +1,80 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "row_group_reader_contents.h" + +#include "page_reader.h" + +namespace gdf { +namespace parquet { +namespace internal { + +RowGroupReaderContents::RowGroupReaderContents( + ::parquet::RandomAccessSource * source, + ::parquet::FileMetaData * file_metadata, + int row_group_number, + const ::parquet::ReaderProperties &props) + : source_(source), file_metadata_(file_metadata), properties_(props) { + row_group_metadata_ = file_metadata->RowGroup(row_group_number); +} + +const ::parquet::RowGroupMetaData * +RowGroupReaderContents::metadata() const { + return row_group_metadata_.get(); +} + +const ::parquet::ReaderProperties * +RowGroupReaderContents::properties() const { + return &properties_; +} + +std::unique_ptr<::parquet::PageReader> +RowGroupReaderContents::GetColumnPageReader(int i) { + auto col = row_group_metadata_->ColumnChunk(i); + + int64_t col_start = col->data_page_offset(); + if (col->has_dictionary_page() + && col_start > col->dictionary_page_offset()) { + col_start = col->dictionary_page_offset(); + } + + std::int64_t col_length = col->total_compressed_size(); + std::unique_ptr<::parquet::InputStream> stream; + + const ::parquet::ApplicationVersion &version = + file_metadata_->writer_version(); + if (version.VersionLt( + ::parquet::ApplicationVersion::PARQUET_816_FIXED_VERSION)) { + std::int64_t bytes_remaining = + source_->Size() - (col_start + col_length); + std::int64_t padding = + std::min(kMaxDictHeaderSize, bytes_remaining); + col_length += padding; + } + + stream = properties_.GetStream(source_, col_start, col_length); + + return std::unique_ptr<::parquet::PageReader>( + new internal::PageReader(std::move(stream), + col->num_values(), + col->compression(), + properties_.memory_pool())); +} + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/parquet/row_group_reader_contents.h b/src/parquet/row_group_reader_contents.h new file mode 100644 index 00000000..e128a1c7 --- /dev/null +++ b/src/parquet/row_group_reader_contents.h @@ -0,0 +1,47 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace gdf { +namespace parquet { +namespace internal { + +class RowGroupReaderContents : public ::parquet::RowGroupReader::Contents { +public: + RowGroupReaderContents(::parquet::RandomAccessSource * source, + ::parquet::FileMetaData * file_metadata, + int row_group_number, + const ::parquet::ReaderProperties &props); + + const ::parquet::RowGroupMetaData *metadata() const final; + const ::parquet::ReaderProperties *properties() const final; + virtual std::unique_ptr<::parquet::PageReader> + GetColumnPageReader(int i) final; + +private: + ::parquet::RandomAccessSource * source_; + ::parquet::FileMetaData * file_metadata_; + std::unique_ptr<::parquet::RowGroupMetaData> row_group_metadata_; + ::parquet::ReaderProperties properties_; + + const std::int64_t kMaxDictHeaderSize = 100; +}; + +} // namespace internal +} // namespace parquet +} // namespace gdf diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index d07fcbc6..30a46de5 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -43,10 +43,11 @@ add_subdirectory(datetime) add_subdirectory(hashing) add_subdirectory(join) add_subdirectory(sqls) -add_subdirectory(hash_map) -add_subdirectory(groupby) +add_subdirectory(parquet) add_subdirectory(unaryops) add_subdirectory(filterops_numeric) +add_subdirectory(hash_map) +add_subdirectory(groupby) add_subdirectory(quantiles) add_subdirectory(validops) diff --git a/src/tests/filterops_numeric/helper/utils.cuh b/src/tests/filterops_numeric/helper/utils.cuh index 7348f4da..3ae86441 100644 --- a/src/tests/filterops_numeric/helper/utils.cuh +++ b/src/tests/filterops_numeric/helper/utils.cuh @@ -3,6 +3,8 @@ #define GDF_TEST_UTILS #include +#include + #include #include #include @@ -13,7 +15,16 @@ #include #include #include -#include "gdf/gdf.h" + +#ifndef EXPECT_TRUE +#define EXPECT_TRUE (expr) + assert(expr); +#endif + +#ifndef EXPECT_EQ +#define EXPECT_EQ (lhs, rhs) + assert((lsh) == (rhs)); +#endif template inline gdf_dtype gdf_enum_type_for() diff --git a/src/tests/helper/utils.cu b/src/tests/helper/utils.cu new file mode 100644 index 00000000..e05d4006 --- /dev/null +++ b/src/tests/helper/utils.cu @@ -0,0 +1,87 @@ + +#include +#include +#include +#include +#include +#include "utils.cuh" + + +gdf_valid_type * get_gdf_valid_from_device(gdf_column* column) { + gdf_valid_type * host_valid_out; + size_t n_bytes = get_number_of_bytes_for_valid(column->size); + host_valid_out = new gdf_valid_type[n_bytes]; + cudaMemcpy(host_valid_out,column->valid, n_bytes, cudaMemcpyDeviceToHost); + return host_valid_out; +} + +std::string gdf_valid_to_str(gdf_valid_type *valid, size_t column_size) +{ + size_t n_bytes = get_number_of_bytes_for_valid(column_size); + std::string response; + for (size_t i = 0; i < n_bytes; i++) + { + size_t length = n_bytes != i + 1 ? 
GDF_VALID_BITSIZE : column_size - GDF_VALID_BITSIZE * (n_bytes - 1);
+        auto result = chartobin(valid[i], length);
+        response += std::string(result);
+    }
+    return response;
+}
+
+gdf_valid_type* gen_gdf_valid(size_t column_size, size_t init_value)
+{
+    gdf_valid_type *valid = nullptr;
+    if (column_size == 0)
+    {
+        valid = new gdf_valid_type[1];
+    }
+    else
+    {
+        size_t n_bytes = get_number_of_bytes_for_valid(column_size);
+        valid = new gdf_valid_type[n_bytes];
+        size_t i;
+        for (i = 0; i < n_bytes - 1; ++i)
+        {
+            valid[i] = (init_value % 256);
+        }
+        size_t length = column_size - GDF_VALID_BITSIZE * (n_bytes - 1);
+        valid[i] = 1 << (length - 1);
+    }
+    return valid;
+}
+
+
+void delete_gdf_column(gdf_column * column){
+    cudaFree(column->data);
+    cudaFree(column->valid);
+}
+
+gdf_size_type count_zero_bits(gdf_valid_type *valid, size_t column_size)
+{
+    size_t numbits = 0;
+    auto bin = gdf_valid_to_str(valid, column_size);
+
+    for (size_t i = 0; i < bin.length(); i++) {
+        if (bin[i] == '0')
+            numbits++;
+    }
+    return numbits;
+}
+
+std::string chartobin(gdf_valid_type c, int size/* = 8*/)
+{
+    std::string bin;
+    bin.resize(size);
+    bin[0] = 0;
+    int i;
+    for (i = 0; i < size; i++)
+    {
+        bin[i] = (c % 2) + '0';
+        c /= 2;
+    }
+    return bin;
+}
+
+auto print_binary(gdf_valid_type n, int size) -> void {
+    std::cout << chartobin(n) << "\t sz: " << size << "\tbinary: " << chartobin(n, size) << std::endl;
+}
diff --git a/src/tests/helper/utils.cuh b/src/tests/helper/utils.cuh
new file mode 100644
index 00000000..9c9d95f3
--- /dev/null
+++ b/src/tests/helper/utils.cuh
@@ -0,0 +1,336 @@
+
+#ifndef GDF_TEST_UTILS
+#define GDF_TEST_UTILS
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+
+template <typename T>
+inline gdf_dtype gdf_enum_type_for()
+{
+    return GDF_invalid;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int8_t>()
+{
+    return GDF_INT8;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int16_t>()
+{
+    return GDF_INT16;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int32_t>()
+{
+    return GDF_INT32;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<int64_t>()
+{
+    return GDF_INT64;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<float>()
+{
+    return GDF_FLOAT32;
+}
+
+template <>
+inline gdf_dtype gdf_enum_type_for<double>()
+{
+    return GDF_FLOAT64;
+}
+
+inline auto get_number_of_bytes_for_valid(size_t column_size) -> size_t {
+    return sizeof(gdf_valid_type) * (column_size + GDF_VALID_BITSIZE - 1) / GDF_VALID_BITSIZE;
+}
+
+
+inline gdf_error gdf_column_view_init(gdf_column *column, void *data, gdf_valid_type *valid,
+                                      gdf_size_type size, gdf_dtype dtype, gdf_size_type null_count) {
+    column->data = data;
+    column->valid = valid;
+    column->size = size;
+    column->dtype = dtype;
+    column->null_count = null_count;
+    return GDF_SUCCESS;
+}
+
+
+auto print_binary(gdf_valid_type n, int size = 8) -> void;
+
+auto chartobin(gdf_valid_type n, int size = 8) -> std::string;
+
+gdf_size_type count_zero_bits(gdf_valid_type *valid, size_t column_size);
+
+auto delete_gdf_column(gdf_column * column) -> void;
+
+auto gen_gdf_valid(size_t column_size, size_t init_value) -> gdf_valid_type *;
+
+gdf_valid_type * get_gdf_valid_from_device(gdf_column* column);
+
+std::string gdf_valid_to_str(gdf_valid_type *valid, size_t column_size);
+
+template <typename RawType, typename PointerType>
+auto init_device_vector(gdf_size_type num_elements) -> std::tuple<RawType *, thrust::device_ptr<PointerType>>
+{
+    RawType *device_pointer;
+    cudaError_t cuda_error = cudaMalloc((void **)&device_pointer, sizeof(PointerType) * num_elements);
+    assert(cuda_error == cudaError::cudaSuccess);
+ thrust::device_ptr device_wrapper = thrust::device_pointer_cast((PointerType *)device_pointer); + return std::make_tuple(device_pointer, device_wrapper); +} + + +template +ValueType* get_gdf_data_from_device(gdf_column* column) { + ValueType* host_out = new ValueType[column->size]; + cudaMemcpy(host_out, column->data, sizeof(ValueType) * column->size, cudaMemcpyDeviceToHost); + return host_out; +} + +template +std::string gdf_data_to_str(void *data, size_t column_size) +{ + std::string response; + for (size_t i = 0; i < column_size; i++) + { + auto result = std::to_string(*((ValueType*)(data) + i)); + response += std::string(result); + } + return response; +} + + +template +gdf_column convert_to_device_gdf_column (gdf_column *column) { + size_t column_size = column->size; + char *raw_pointer; + thrust::device_ptr device_pointer; + std::tie(raw_pointer, device_pointer) = init_device_vector(column_size); + + void* host_out = column->data; + cudaMemcpy(raw_pointer, host_out, sizeof(ValueType) * column->size, cudaMemcpyHostToDevice); + + gdf_valid_type *host_valid = column->valid; + size_t n_bytes = get_number_of_bytes_for_valid(column_size); + + gdf_valid_type *valid_value_pointer; + cudaMalloc((void **)&valid_value_pointer, n_bytes); + cudaMemcpy(valid_value_pointer, host_valid, n_bytes, cudaMemcpyHostToDevice); + + gdf_column output; + gdf_column_view_init(&output, (void *)raw_pointer, valid_value_pointer, column_size, column->dtype, column->null_count); + return output; +} + +template +gdf_column convert_to_host_gdf_column (gdf_column *column) { + auto host_out = get_gdf_data_from_device(column); + auto host_valid_out = get_gdf_valid_from_device(column); + + auto output = *column; + output.data = host_out; + output.valid = host_valid_out; + return output; +} + + +template +auto print_column(gdf_column * column) -> void { + auto host_out = get_gdf_data_from_device(column); + auto bitmap = get_gdf_valid_from_device(column); + std::cout<<"Printing Column\t null_count:" << column->null_count << "\t type " << column->dtype << std::endl; + size_t n_bytes = sizeof(int8_t) * (column->size + GDF_VALID_BITSIZE - 1) / GDF_VALID_BITSIZE; + for(std::size_t i = 0; i < column->size; i++) { + size_t col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : column->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + + ValueType value = static_cast(host_out)[i]; + + if ( bitmap[i / 8] & (1 << (i % 8)) ) { + std::cout << "host_out[" << i << "] = " << value <<"\t\tvalid="<< 1 <(column_size); + // std::cout << "1. gen_gdb_column\n"; + + using thrust::detail::make_normal_iterator; + thrust::fill(make_normal_iterator(device_pointer), make_normal_iterator(device_pointer + column_size), init_value); + //std::cout << "2. gen_gdb_column\n"; + + gdf_valid_type *host_valid = gen_gdf_valid(column_size, init_value); + size_t n_bytes = get_number_of_bytes_for_valid(column_size); + + gdf_valid_type *valid_value_pointer; + cudaMalloc((void **)&valid_value_pointer, n_bytes); + cudaMemcpy(valid_value_pointer, host_valid, n_bytes, cudaMemcpyHostToDevice); + // std::cout << "3. gen_gdb_column\n"; + + gdf_column output; + auto zero_bits = output.null_count = count_zero_bits(host_valid, column_size); + + gdf_column_view_init(&output, + (void *)raw_pointer, valid_value_pointer, + column_size, + gdf_enum_type_value, + zero_bits); + //std::cout << "4. 
gen_gdb_column\n"; + + delete []host_valid; + return output; +} + +template +void check_column_for_stencil_operation(gdf_column *column, gdf_column *stencil, gdf_column *output_op) { + gdf_column host_column = convert_to_host_gdf_column(column); + gdf_column host_stencil = convert_to_host_gdf_column(stencil); + gdf_column host_output_op = convert_to_host_gdf_column(output_op); + + assert(host_column.size == host_stencil.size); + //EXPECT_EQ(host_column.dtype == host_output_op.dtype); // it must have the same type + + + int n_bytes = sizeof(int8_t) * (column->size + GDF_VALID_BITSIZE - 1) / GDF_VALID_BITSIZE; + std::vector indexes; + for(size_t i = 0; i < host_stencil.size; i++) { + int col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : column->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + bool valid = ((host_stencil.valid[col_position] >> bit_offset ) & 1) != 0; + if ( (int)( ((int8_t *)host_stencil.data)[i] ) == 1 && valid ) { + indexes.push_back(i); + } + } + + for(size_t i = 0; i < indexes.size(); i++) + { + int index = indexes[i]; + LeftValueType value = ((LeftValueType *)(host_column.data))[index]; + std::cout << "filtered values: " << index << "** " << "\t value: " << (int)value << std::endl; + assert( ((RightValueType*)host_output_op.data)[i] == value); + + int col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : output_op->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + bool valid = ((host_output_op.valid[col_position] >> bit_offset ) & 1) != 0; + assert(valid == true); + } +} + +template +void check_column_for_comparison_operation(gdf_column *lhs, gdf_column *rhs, gdf_column *output, gdf_comparison_operator gdf_operator) +{ + { + auto lhs_valid = get_gdf_valid_from_device(lhs); + auto rhs_valid = get_gdf_valid_from_device(rhs); + auto output_valid = get_gdf_valid_from_device(output); + + size_t n_bytes = get_number_of_bytes_for_valid(output->size); + + assert(lhs->size == rhs->size); + + for(int i = 0; i < output->size; i++) { + int col_position = i / 8; + size_t length_col = n_bytes != col_position+1 ? GDF_VALID_BITSIZE : output->size - GDF_VALID_BITSIZE * (n_bytes - 1); + int bit_offset = (length_col - 1) - (i % 8); + + assert( ((lhs_valid[col_position] >> bit_offset ) & 1) & ((rhs_valid[col_position] >> bit_offset ) & 1) == + ((output_valid[col_position] >> bit_offset ) & 1) ); + } + + delete[] lhs_valid; + delete[] rhs_valid; + delete[] output_valid; + } + + { + auto lhs_data = get_gdf_data_from_device(lhs); + auto rhs_data = get_gdf_data_from_device(rhs); + auto output_data = get_gdf_data_from_device(output); + + assert(lhs->size == rhs->size); + for(size_t i = 0; i < lhs->size; i++) + { + assert(lhs_data[i] == rhs_data[i] ? 
1 : 0 == output_data[i]); + } + + delete[] lhs_data; + delete[] rhs_data; + delete[] output_data; + } + +} + +template +void check_column_for_concat_operation(gdf_column *lhs, gdf_column *rhs, gdf_column *output) +{ + { + auto lhs_valid = get_gdf_valid_from_device(lhs); + auto rhs_valid = get_gdf_valid_from_device(rhs); + auto output_valid = get_gdf_valid_from_device(output); + + auto computed = gdf_valid_to_str(output_valid, output->size); + auto expected = gdf_valid_to_str(lhs_valid, lhs->size) + gdf_valid_to_str(rhs_valid, rhs->size); + + //std::cout << "computed: " << computed << std::endl; + //std::cout << "expected: " << expected << std::endl; + + delete[] lhs_valid; + delete[] rhs_valid; + delete[] output_valid; + assert(computed == expected); + } + + { + auto lhs_data = get_gdf_data_from_device(lhs); + auto rhs_data = get_gdf_data_from_device(rhs); + auto output_data = get_gdf_data_from_device(output); + + auto computed = gdf_data_to_str(output_data, output->size); + auto expected = gdf_data_to_str(lhs_data, lhs->size) + gdf_data_to_str(rhs_data, rhs->size); + delete[] lhs_data; + delete[] rhs_data; + delete[] output_data; + assert(computed == expected); + } + +} + + +#endif // GDF_TEST_UTILS diff --git a/src/tests/parquet/CMakeLists.txt b/src/tests/parquet/CMakeLists.txt new file mode 100644 index 00000000..37ad2882 --- /dev/null +++ b/src/tests/parquet/CMakeLists.txt @@ -0,0 +1,32 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + + + + +function(GDF_ADD_PARQUET_TEST TARGET) + message(STATUS "${TARGET} will link against: gdf parquet") + list(REMOVE_AT ARGV 0) + cuda_add_executable(${TARGET} ${ARGV}) + target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/src/parquet) + target_link_libraries(${TARGET} gmock_main gmock GTest::GTest gdf-parquet) + get_property(ARGN TARGET ${TARGET} PROPERTY SOURCES) + # gtest_add_tests(${TARGET} "" ${ARGN}) +endfunction() + +add_subdirectory(file_reader) +add_subdirectory(gdf_column) diff --git a/src/tests/parquet/file_reader/CMakeLists.txt b/src/tests/parquet/file_reader/CMakeLists.txt new file mode 100644 index 00000000..58c64e7e --- /dev/null +++ b/src/tests/parquet/file_reader/CMakeLists.txt @@ -0,0 +1,33 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +find_package(Boost REQUIRED COMPONENTS filesystem) + +set(PARQUET_FILE_PATH + ${CMAKE_SOURCE_DIR}/src/tests/parquet/file_reader/reader-test.parquet) + +GDF_ADD_PARQUET_TEST(file_reader-test + #file_reader-test.cpp + #single_column_file-test.cpp + api-test.cu + null-test.cu + ../../helper/utils.cuh + ../../helper/utils.cu) + +target_compile_definitions(file_reader-test + PUBLIC -DPARQUET_FILE_PATH="${PARQUET_FILE_PATH}") +target_link_libraries(file_reader-test Boost::filesystem) diff --git a/src/tests/parquet/file_reader/api-test.cu b/src/tests/parquet/file_reader/api-test.cu new file mode 100644 index 00000000..7318cc6a --- /dev/null +++ b/src/tests/parquet/file_reader/api-test.cu @@ -0,0 +1,445 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * Copyright 2018 William Malpica + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include "../../helper/utils.cuh" + +class ParquetReaderAPITest : public testing::Test { +protected: + ParquetReaderAPITest() + : filename(boost::filesystem::unique_path().native()) {} + + std::int32_t + genInt32(int i) { + if (i >= 100 && i < 150) { + return 10000; + } else if (i >= 200 && i < 300) { + return 20000; + } else if (i >= 310 && i < 350) { + return 30000; + } else if (i >= 450 && i < 550) { + return 40000; + } else if (i >= 800 && i < 950) { + return 50000; + } else { + return i * 100; + } + } + + std::int64_t + genInt64(int i) { + if (i >= 100 && i < 150) { + return 10000; + } else if (i >= 200 && i < 300) { + return 20000; + } else if (i >= 310 && i < 350) { + return 30000; + } else if (i >= 450 && i < 550) { + return 40000; + } else if (i >= 800 && i < 950) { + return 50000; + } else { + return i * 100000; + } + } + + void + SetUp() final { + static constexpr std::size_t kGroups = 3; + static constexpr std::size_t kRowsPerGroup = 499; + try { + + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = + CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + std::int16_t repetition_level = 0; + + for (std::size_t i = 0; i < kGroups; i++) { + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::BoolWriter *bool_writer = + static_cast<::parquet::BoolWriter *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 1 : 0; + bool bool_value = true; + bool_writer->WriteBatch( + 1, &definition_level, &repetition_level, &bool_value); + } + + ::parquet::Int32Writer *int32_writer = + static_cast<::parquet::Int32Writer *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 1 : 0; + std::int32_t int32_value = genInt32(ind); + int32_writer->WriteBatch( + 1, &definition_level, &repetition_level, &int32_value); + } + + ::parquet::Int64Writer *int64_writer = + static_cast<::parquet::Int64Writer *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 1 : 0; + std::int64_t int64_value = genInt64(ind); + int64_writer->WriteBatch( + 1, &definition_level, &repetition_level, &int64_value); + } + + ::parquet::DoubleWriter *double_writer = + static_cast<::parquet::DoubleWriter *>( + row_group_writer->NextColumn()); + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + int ind = i * kRowsPerGroup + j; + std::int16_t definition_level = ind % 3 > 0 ? 
1 : 0; + double double_value = (double)ind; + double_writer->WriteBatch( + 1, &definition_level, &repetition_level, &double_value); + } + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { + FAIL() << "Generate file" << e.what(); + } + } + + std ::shared_ptr<::parquet::schema::GroupNode> + CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{ + ::parquet::schema::PrimitiveNode::Make( + "boolean_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::BOOLEAN, + ::parquet::LogicalType::NONE), + ::parquet::schema::PrimitiveNode::Make( + "int32_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::INT32, + ::parquet::LogicalType::NONE), + ::parquet::schema::PrimitiveNode::Make( + "int64_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::INT64, + ::parquet::LogicalType::NONE), + ::parquet::schema::PrimitiveNode::Make( + "double_field", + ::parquet::Repetition::OPTIONAL, + ::parquet::Type::DOUBLE, + ::parquet::LogicalType::NONE), + })); + } + + void + TearDown() final { + if (std::remove(filename.c_str())) { FAIL() << "Remove file"; } + } + + void + checkNulls(/*const */ gdf_column &column) { + + const std::size_t valid_size = + arrow::BitUtil::BytesForBits(column.size); + const std::size_t valid_last = valid_size - 1; + + int fails = 0; + for (std::size_t i = 0; i < valid_last; i++) { + + if (i % 3 == 0){ + std::uint8_t valid = column.valid[i]; + std::uint8_t expected = 0b10110110; + EXPECT_EQ(expected, valid); + if (expected != valid){ + std::cout<<"fail at checkNulls i: "< 5) + break; + } + } else if (i % 3 == 1){ + std::uint8_t valid = column.valid[i]; + std::uint8_t expected = 0b01101101; + EXPECT_EQ(expected, valid); + if (expected != valid){ + std::cout<<"fail at checkNulls i: "< 5) + break; + } + } else { + std::uint8_t valid = column.valid[i]; + std::uint8_t expected = 0b11011011; + EXPECT_EQ(expected, valid); + if (expected != valid){ + std::cout<<"fail at checkNulls i: "< 5) + break; + } + } + + + } +// EXPECT_EQ(0b00101101, 0b00101101 & column.valid[valid_last]); + } + + void + checkBoolean(/*const */ gdf_column &column) { + + gdf_column boolean_column = + convert_to_host_gdf_column<::parquet::BooleanType::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < boolean_column.size; i++) { + if (i % 3 > 0) { + bool expected = true; + bool value = static_cast(boolean_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkBoolean row: "< 5){ + break; + } + } + } + } + checkNulls(boolean_column); + } + + void + checkInt32(/*const */ gdf_column &column) { + + gdf_column int32_column = + convert_to_host_gdf_column<::parquet::Int32Type::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < int32_column.size; i++) { + if (i % 3 > 0) { + std::int32_t expected = genInt32(i); + std::int32_t value = + static_cast(int32_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkInt32 row: "< 5){ + break; + } + } + } + } + + checkNulls(int32_column); + } + + void + checkInt64(/*const */ gdf_column &column) { + gdf_column int64_column = + convert_to_host_gdf_column<::parquet::Int64Type::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < int64_column.size; i++) { + if (i % 3 > 0) { + std::int64_t expected = genInt64(i); + std::int64_t value = + 
static_cast(int64_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkInt64 row: "< 5){ + break; + } + } + } + } + + checkNulls(int64_column); + } + + void + checkDouble(/*const */ gdf_column &column) { + gdf_column double_column = + convert_to_host_gdf_column<::parquet::DoubleType::c_type>(&column); + + int fails = 0; + + for (std::size_t i = 0; i < double_column.size; i++) { + if (i % 3 > 0) { + double expected = static_cast(i); + double value = static_cast(double_column.data)[i]; + + EXPECT_EQ(expected, value); + + if (expected != value){ + std::cout<<"fail at checkDouble row: "< 5){ + break; + } + } + } + } + + checkNulls(double_column); + } + + const std::string filename; + + gdf_column *columns = nullptr; + std::size_t columns_length = 0; +}; + +TEST_F(ParquetReaderAPITest, ReadAll) { + + gdf_error error_code = gdf::parquet::read_parquet( + filename.c_str(), nullptr, &columns, &columns_length); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns_length); + + EXPECT_EQ(columns[0].size, columns[1].size); + EXPECT_EQ(columns[1].size, columns[2].size); + + checkBoolean(columns[0]); + checkInt32(columns[1]); + checkInt64(columns[2]); + checkDouble(columns[3]); +} + +TEST_F(ParquetReaderAPITest, ReadSomeColumns) { + const char *const column_names[] = {"double_field", "int64_field", nullptr}; + + gdf_error error_code = gdf::parquet::read_parquet( + filename.c_str(), column_names, &columns, &columns_length); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(2U, columns_length); + + checkDouble(columns[0]); + checkInt64(columns[1]); +} + +TEST_F(ParquetReaderAPITest, ByIdsInOrder) { + const std::vector row_group_indices = {0, 1}; + const std::vector column_indices = {0, 1, 2, 3}; + + std::vector columns; + + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + filename, row_group_indices, column_indices, columns); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns.size()); + + checkBoolean(*columns[0]); + checkInt32(*columns[1]); + checkInt64(*columns[2]); + checkDouble(*columns[3]); +} + +TEST_F(ParquetReaderAPITest, ByIdsOutOfOrder) { + const std::vector row_group_indices = {0, 1}; + const std::vector column_indices = {1, 3, 2, 0}; + + std::vector columns; + + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + filename, row_group_indices, column_indices, columns); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns.size()); + + checkBoolean(*columns[3]); + checkInt32(*columns[0]); + checkInt64(*columns[2]); + checkDouble(*columns[1]); +} + +TEST_F(ParquetReaderAPITest, ByIdsInFromInterface) { + const std::vector row_group_indices = {0, 1}; + const std::vector column_indices = {0, 1, 2, 3}; + + std::vector columns; + + std::shared_ptr<::arrow::io::ReadableFile> file; + const ::parquet::ReaderProperties properties = ::parquet::default_reader_properties(); + ::arrow::io::ReadableFile::Open(filename, properties.memory_pool(), &file); + + gdf_error error_code = gdf::parquet::read_parquet_by_ids( + file, row_group_indices, column_indices, columns); + + EXPECT_EQ(GDF_SUCCESS, error_code); + + EXPECT_EQ(4U, columns.size()); + + checkBoolean(*columns[0]); + checkInt32(*columns[1]); + checkInt64(*columns[2]); + checkDouble(*columns[3]); +} diff --git a/src/tests/parquet/file_reader/file_reader-test.cpp b/src/tests/parquet/file_reader/file_reader-test.cpp new file mode 100644 index 00000000..177ccdd7 --- /dev/null +++ b/src/tests/parquet/file_reader/file_reader-test.cpp @@ -0,0 +1,121 @@ 
+/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#ifndef PARQUET_FILE_PATH +#error PARQUET_FILE_PATH must be defined for precompiling +#define PARQUET_FILE_PATH "/" +#endif + +inline static void +checkMetadata(const std::shared_ptr &metadata) { + EXPECT_EQ(1, metadata->num_row_groups()); + EXPECT_EQ(3, metadata->num_columns()); +} + +inline static void +checkRowGroup(const std::unique_ptr &reader) { + const std::shared_ptr<::parquet::RowGroupReader> row_group = + reader->RowGroup(0); + + std::size_t i; + std::int16_t definition_level; + std::int16_t repetition_level; + std::uint8_t valid_bits; + std::int64_t levels_read; + std::int64_t values_read = 0; + std::int64_t nulls_count; + + std::shared_ptr column; + + column = row_group->Column(0); + gdf::parquet::BoolReader *bool_reader = + static_cast(column.get()); + i = 0; +// while (bool_reader->HasNext()) { +// bool value; +// bool_reader->ReadBatchSpaced(1, +// &definition_level, +// &repetition_level, +// &value, +// &valid_bits, +// 0, +// &levels_read, +// &values_read, +// &nulls_count); +// bool expected = (i % 2) == 0; +// EXPECT_EQ(expected, value); +// i++; +// } + + column = row_group->Column(1); + gdf::parquet::Int64Reader *int64_reader = + static_cast(column.get()); + i = 0; +// while (int64_reader->HasNext()) { +// std::int64_t value; +// int64_reader->ReadBatchSpaced(1, +// &definition_level, +// &repetition_level, +// &value, +// &valid_bits, +// 0, +// &levels_read, +// &values_read, +// &nulls_count); +// std::int64_t expected = static_cast(i) * 1000000000000; +// EXPECT_EQ(expected, value); +// i++; +// } + + column = row_group->Column(2); + gdf::parquet::DoubleReader *double_reader = + static_cast(column.get()); + i = 0; +// while (double_reader->HasNext()) { +// double value; +// double_reader->ReadBatchSpaced(1, +// &definition_level, +// &repetition_level, +// &value, +// &valid_bits, +// 0, +// &levels_read, +// &values_read, +// &nulls_count); +// double expected = i * 0.001; +// EXPECT_EQ(expected, value); +// i++; +// } +} + +TEST(FileReaderTest, Read) { + std::unique_ptr reader = + gdf::parquet::FileReader::OpenFile(PARQUET_FILE_PATH); + + checkMetadata(reader->metadata()); + checkRowGroup(reader); +} diff --git a/src/tests/parquet/file_reader/null-test.cu b/src/tests/parquet/file_reader/null-test.cu new file mode 100644 index 00000000..6f1a1c1b --- /dev/null +++ b/src/tests/parquet/file_reader/null-test.cu @@ -0,0 +1,200 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "../../../parquet/column_reader.h" +#include "../../../parquet/file_reader.h" + +#include + +#include "../../helper/utils.cuh" + +template +class NullTest : public ::testing::Test { +protected: + using TYPE = typename DataType::c_type; + + NullTest(); + + void GenerateFile(); + TYPE GenerateValue(std::size_t i); + + virtual void SetUp() override; + virtual void TearDown() override; + + static constexpr std::size_t kGroups = 2; + static constexpr std::size_t kRowsPerGroup = 50; + + const std::string filename; + +private: + std::shared_ptr<::parquet::schema::GroupNode> CreateSchema(); +}; + +using Types = ::testing::Types<::parquet::Int64Type>; +TYPED_TEST_CASE(NullTest, Types); + +template +void +NullTest::SetUp() { + GenerateFile(); +} + +template +void +NullTest::TearDown() { + if (std::remove(filename.c_str())) { FAIL() << "Remove file"; } +} + +template +NullTest::NullTest() + : filename(boost::filesystem::unique_path().native()) {} + +template +void +NullTest::GenerateFile() { + try { + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + for (std::size_t i = 0; i < kGroups; i++) { + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::TypedColumnWriter *writer = + static_cast<::parquet::TypedColumnWriter *>( + row_group_writer->NextColumn()); + std::int16_t repetition_level = 0; + for (std::size_t j = 0; j < kRowsPerGroup; j++) { + TYPE value = GenerateValue(i * kRowsPerGroup + j); + std::int16_t definition_level = j % 2; + writer->WriteBatch( + 1, &definition_level, &repetition_level, &value); + } + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { FAIL() << "Generate file" << e.what(); } +} + +template +std::shared_ptr<::parquet::schema::GroupNode> +NullTest::CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{::parquet::schema::PrimitiveNode::Make( + "field", + ::parquet::Repetition::OPTIONAL, + DataType::type_num, + ::parquet::LogicalType::NONE)})); +} + +template +typename NullTest::TYPE +NullTest::GenerateValue(std::size_t i) { + return static_cast(i) * 10; +} + +TYPED_TEST(NullTest, ReadAll) { + std::unique_ptr reader = + gdf::parquet::FileReader::OpenFile(this->filename); + + std::shared_ptr> column_reader = + std::static_pointer_cast>( + reader->RowGroup(0)->Column(0)); + + ASSERT_TRUE(column_reader->HasNext()); + + 
using value_type = typename TypeParam::c_type; + + const std::size_t rowsPerGroup = this->kRowsPerGroup; + const std::size_t groups = this->kGroups; + + gdf_column column{ + .data = nullptr, + .valid = nullptr, + .size = 0, + .dtype = GDF_INT64, + .null_count = 0, + .dtype_info = {}, + }; + + std::size_t valid_size = + get_number_of_bytes_for_valid(rowsPerGroup * groups); + + cudaMalloc(&column.data, rowsPerGroup * groups * sizeof(value_type)); + cudaMalloc(&column.valid, valid_size); + + const std::size_t total_read = column_reader->ToGdfColumn(column); + + column_reader = + std::static_pointer_cast>( + reader->RowGroup(1)->Column(0)); + + ASSERT_TRUE(column_reader->HasNext()); + const std::size_t total_read2 = column_reader->ToGdfColumn(column, 50); + + column.size = static_cast(rowsPerGroup * groups); + + EXPECT_EQ(rowsPerGroup, total_read); + + gdf_column host_column = convert_to_host_gdf_column(&column); + + for (std::size_t i = 0; i < groups * rowsPerGroup; i++) { + value_type expected = this->GenerateValue(i); + std::int64_t value = static_cast(host_column.data)[i]; + if (i % 2) { EXPECT_EQ(expected, value); } + } + + const std::size_t valid_last = valid_size - 1; + for (std::size_t i = 0; i < valid_last; i++) { + std::uint8_t valid = host_column.valid[i]; + EXPECT_EQ(0b10101010, valid); + } + EXPECT_EQ(0b00001010, 0b00001010 & host_column.valid[valid_last]); + + delete_gdf_column(&column); +} diff --git a/src/tests/parquet/file_reader/reader-test.parquet b/src/tests/parquet/file_reader/reader-test.parquet new file mode 100644 index 00000000..f00b6a8e Binary files /dev/null and b/src/tests/parquet/file_reader/reader-test.parquet differ diff --git a/src/tests/parquet/file_reader/single_column_file-test.cpp b/src/tests/parquet/file_reader/single_column_file-test.cpp new file mode 100644 index 00000000..11c2b727 --- /dev/null +++ b/src/tests/parquet/file_reader/single_column_file-test.cpp @@ -0,0 +1,180 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "column_reader.h" +#include "file_reader.h" + +#include + +template +class SingleColumnFileTest : public ::testing::Test { +protected: + using TYPE = typename DataType::c_type; + + SingleColumnFileTest(); + + void GenerateFile(); + TYPE GenerateValue(std::size_t i); + + virtual void SetUp() override; + virtual void TearDown() override; + + static constexpr std::size_t kRowsPerGroup = 100; + + const std::string filename; + +private: + std::shared_ptr<::parquet::schema::GroupNode> CreateSchema(); +}; + +using Types = ::testing::Types<::parquet::BooleanType, + ::parquet::Int32Type, + ::parquet::Int64Type, + ::parquet::FloatType, + ::parquet::DoubleType>; +TYPED_TEST_CASE(SingleColumnFileTest, Types); + +template +void +SingleColumnFileTest::SetUp() { + GenerateFile(); +} + +template +void +SingleColumnFileTest::TearDown() { + if (std::remove(filename.c_str())) { FAIL() << "Remove file"; } +} + +template +SingleColumnFileTest::SingleColumnFileTest() + : filename(boost::filesystem::unique_path().native()) {} + +template +void +SingleColumnFileTest::GenerateFile() { + try { + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::TypedColumnWriter *writer = + static_cast<::parquet::TypedColumnWriter *>( + row_group_writer->NextColumn()); + std::int16_t repetition_level = 0; + for (std::size_t i = 0; i < kRowsPerGroup; i++) { + TYPE value = GenerateValue(i); + std::int16_t definition_level = i % 2 ? 
1 : 0; + writer->WriteBatch( + 1, &definition_level, &repetition_level, &value); + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { + FAIL() << "Generate file" << e.what(); + } +} + +template +std::shared_ptr<::parquet::schema::GroupNode> +SingleColumnFileTest::CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{::parquet::schema::PrimitiveNode::Make( + "field", + ::parquet::Repetition::OPTIONAL, + DataType::type_num, + ::parquet::LogicalType::NONE)})); +} + +template +typename SingleColumnFileTest::TYPE +SingleColumnFileTest::GenerateValue(std::size_t i) { + return static_cast(i) * 1000000000000; +} + +TYPED_TEST(SingleColumnFileTest, ReadAll) { + std::unique_ptr reader = + gdf::parquet::FileReader::OpenFile(this->filename); + + std::shared_ptr> column_reader = + std::static_pointer_cast>( + reader->RowGroup(0)->Column(0)); + + ASSERT_TRUE(column_reader->HasNext()); + + using value_type = typename TypeParam::c_type; + + const std::size_t rowsPerGroup = this->kRowsPerGroup; + + gdf_column column{ + .data = new std::uint8_t[rowsPerGroup * sizeof(value_type)], + .valid = new std::uint8_t[rowsPerGroup], + .size = 0, + .dtype = GDF_invalid, + .null_count = 0, + .dtype_info = {}, + }; + std::int16_t definition_levels[rowsPerGroup]; + std::int16_t repetition_levels[rowsPerGroup]; + + const std::size_t total_read = + column_reader->ToGdfColumn(definition_levels, repetition_levels, column); + + EXPECT_EQ(rowsPerGroup, total_read); + + for (std::size_t i = 0; i < rowsPerGroup; i++) { + value_type expected = this->GenerateValue(i); + std::int64_t value = static_cast(column.data)[i]; + if (i % 2) { EXPECT_EQ(expected, value); } + } + + delete[] static_cast(column.data); + delete[] column.valid; +} diff --git a/src/tests/parquet/gdf_column/CMakeLists.txt b/src/tests/parquet/gdf_column/CMakeLists.txt new file mode 100644 index 00000000..1164fc4c --- /dev/null +++ b/src/tests/parquet/gdf_column/CMakeLists.txt @@ -0,0 +1,36 @@ + + +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + + + +find_package(Boost REQUIRED COMPONENTS filesystem) + +set(PARQUET_FILE_FOR_DECODING_PATH + ${CMAKE_SOURCE_DIR}/src/tests/parquet/gdf_column/reader-test.parquet) + +add_definitions(-DPARQUET_FILE_FOR_DECODING_PATH="${PARQUET_FILE_FOR_DECODING_PATH}") + +GDF_ADD_PARQUET_TEST(gdf_column-test + gdf_column-test.cu + ../../helper/utils.cuh + ../../helper/utils.cu + ) + +target_link_libraries(gdf_column-test Boost::filesystem) + diff --git a/src/tests/parquet/gdf_column/gdf_column-test.cu b/src/tests/parquet/gdf_column/gdf_column-test.cu new file mode 100644 index 00000000..b3beb60e --- /dev/null +++ b/src/tests/parquet/gdf_column/gdf_column-test.cu @@ -0,0 +1,213 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Alexander Ocsa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + #include + + #include + + #include + + #include + #include + + + #include + #include + #include + #include + #include + + #include + + #include + #include + + #include "../../../parquet/column_reader.h" + #include "../../../parquet/file_reader.h" + + #include "../../helper/utils.cuh" + + #ifndef PARQUET_FILE_FOR_DECODING_PATH + #error PARQUET_FILE_FOR_DECODING_PATH must be defined for precompiling + #define PARQUET_FILE_FOR_DECODING_PATH "/" + #endif + + template + class SingleColumnToGdfTest : public ::testing::Test { + protected: + using TYPE = typename DataType::c_type; + + SingleColumnToGdfTest(); + + void GenerateFile(); + + inline TYPE GenerateValue(size_t i) { + if (sizeof (TYPE) == 1 ) { + return i % 2; + } + return static_cast(i) * 10; + } + + virtual void SetUp() override; + + virtual void TearDown() override; + + static constexpr size_t kRowsPerGroup = 50; + + const std::string filename; + + private: + std::shared_ptr<::parquet::schema::GroupNode> CreateSchema(); + }; + + using Types = ::testing::Types<::parquet::BooleanType, + ::parquet::Int32Type>; + TYPED_TEST_CASE(SingleColumnToGdfTest, Types); + + template + void SingleColumnToGdfTest::SetUp() { + GenerateFile(); + } + + template + void SingleColumnToGdfTest::TearDown() { + if ( std::remove(filename.c_str())) { + FAIL() << "Remove file"; + } + } + + template + SingleColumnToGdfTest::SingleColumnToGdfTest() + : filename ( boost::filesystem::unique_path().native()) + { + } + + template + void SingleColumnToGdfTest::GenerateFile() { + try { + std::shared_ptr<::arrow::io::FileOutputStream> stream; + PARQUET_THROW_NOT_OK( + ::arrow::io::FileOutputStream::Open(filename, &stream)); + + std::shared_ptr<::parquet::schema::GroupNode> schema = CreateSchema(); + + ::parquet::WriterProperties::Builder builder; + builder.compression(::parquet::Compression::SNAPPY); + std::shared_ptr<::parquet::WriterProperties> properties = + builder.build(); + + // Set ColumnDescriptor! 
= 3 + + std::shared_ptr<::parquet::ParquetFileWriter> file_writer = + ::parquet::ParquetFileWriter::Open(stream, schema, properties); + + ::parquet::RowGroupWriter *row_group_writer = + file_writer->AppendRowGroup(kRowsPerGroup); + + ::parquet::TypedColumnWriter *writer = + static_cast<::parquet::TypedColumnWriter *>( + row_group_writer->NextColumn()); + std::int16_t repetition_level = 0; + for (std::size_t i = 0; i < kRowsPerGroup; i++) { + TYPE value = GenerateValue(i); + std::int16_t definition_level = i % 2 ? 1 : 0; + writer->WriteBatch( + 1, &definition_level, &repetition_level, &value); + } + + file_writer->Close(); + + DCHECK(stream->Close().ok()); + } catch (const std::exception &e) { + FAIL() << "Generate file" << e.what(); + } + } + + + template + std::shared_ptr<::parquet::schema::GroupNode> + SingleColumnToGdfTest::CreateSchema() { + return std::static_pointer_cast<::parquet::schema::GroupNode>( + ::parquet::schema::GroupNode::Make( + "schema", + ::parquet::Repetition::REQUIRED, + ::parquet::schema::NodeVector{::parquet::schema::PrimitiveNode::Make( + "field", + ::parquet::Repetition::OPTIONAL, + DataType::type_num, + ::parquet::LogicalType::NONE)})); + } + + + + TYPED_TEST(SingleColumnToGdfTest, ReadAll) { + + using FileReader = gdf::parquet::FileReader; + using ColumnReader = gdf::parquet::ColumnReader; + auto reader = FileReader::OpenFile(this->filename); + auto row_group = reader->RowGroup(0); + auto abstract_column_reader = row_group->Column(0); + std::cout << "column_reader id : " << typeid(abstract_column_reader).name() << std::endl; + + auto column_reader = std::static_pointer_cast(abstract_column_reader); + + ASSERT_TRUE(column_reader->HasNext()); + + using value_type = typename TypeParam::c_type; + + const std::size_t rowsPerGroup = this->kRowsPerGroup; + + gdf_column column{ + .data = nullptr, + .valid = nullptr, + .size = rowsPerGroup, + .dtype = GDF_invalid, + .null_count = 0, + .dtype_info = {}, + }; + cudaMalloc(&column.data, rowsPerGroup * sizeof(value_type)); + + auto n_bytes = get_number_of_bytes_for_valid(this->kRowsPerGroup); + cudaMalloc((void **)&column.valid, n_bytes); + + // std::int16_t definition_levels[rowsPerGroup]; + // std::int16_t repetition_levels[rowsPerGroup]; + + const std::size_t total_read = + column_reader->ToGdfColumn(column); + + column.size = static_cast(rowsPerGroup); + // column.dtype = ParquetTraits::gdfDType; + + EXPECT_EQ(rowsPerGroup, total_read); // using ReadBatch + + print_column(&column); + + gdf_column host_column = convert_to_host_gdf_column(&column); + + for (std::size_t i = 0; i < rowsPerGroup; i++) { + if (i % 2) { + value_type expected = this->GenerateValue(i); + value_type value = static_cast(host_column.data)[i]; + EXPECT_EQ(expected, value); + } + } + + delete_gdf_column(&column); + } + diff --git a/src/tests/parquet/gdf_column/reader-test.parquet b/src/tests/parquet/gdf_column/reader-test.parquet new file mode 100644 index 00000000..f00b6a8e Binary files /dev/null and b/src/tests/parquet/gdf_column/reader-test.parquet differ
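
For reference, a minimal sketch of how the reader added by this patch is driven from application code, following the call pattern exercised in api-test.cu above. The public header path (gdf/parquet/api.h), the inclusion of gdf/gdf.h for gdf_column and gdf_error, and the cleanup step are assumptions; the read_parquet and read_parquet_by_ids signatures mirror the test code.

// usage-sketch.cpp -- illustrative only; header paths and cleanup are assumptions.
#include <cstddef>
#include <iostream>

#include <gdf/gdf.h>          // assumed: defines gdf_column, gdf_error, GDF_SUCCESS
#include <gdf/parquet/api.h>  // assumed public header exposing gdf::parquet::read_parquet

int main() {
    gdf_column *columns = nullptr;
    std::size_t columns_length = 0;

    // Read every row group and every column into device-resident gdf_columns,
    // as ParquetReaderAPITest.ReadAll does.
    gdf_error status = gdf::parquet::read_parquet(
        "reader-test.parquet", /*column_names=*/nullptr, &columns, &columns_length);
    if (status != GDF_SUCCESS) {
        std::cerr << "read_parquet failed\n";
        return 1;
    }

    std::cout << "columns read: " << columns_length << "\n";
    if (columns_length != 0) {
        std::cout << "rows per column: " << columns[0].size << "\n";
    }

    // A column subset is selected with a null-terminated list of names,
    // as in ParquetReaderAPITest.ReadSomeColumns:
    //   const char *const names[] = {"double_field", "int64_field", nullptr};
    //   gdf::parquet::read_parquet("reader-test.parquet", names, &columns, &columns_length);

    // Device buffers would be released here (e.g. with the delete_gdf_column
    // helper from the test utilities) -- omitted for brevity.
    return 0;
}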