diff --git a/libgdf/CMakeLists.txt b/libgdf/CMakeLists.txt index 59f4a856..0a8d3ce7 100644 --- a/libgdf/CMakeLists.txt +++ b/libgdf/CMakeLists.txt @@ -151,6 +151,7 @@ cuda_add_library(gdf SHARED src/sqls_ops.cu src/streamcompactionops.cu src/unaryops.cu + src/replace.cu #src/windowedops.cu src/quantiles.cu src/io/csv/csv-reader.cu @@ -220,5 +221,10 @@ if(GTEST_FOUND) else() message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.") endif() + +if(GDF_BENCHMARK) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/bench) + endif() + # Print the project summary feature_summary(WHAT ALL INCLUDE_QUIET_PACKAGES FATAL_ON_MISSING_REQUIRED_PACKAGES) diff --git a/libgdf/include/gdf/cffi/functions.h b/libgdf/include/gdf/cffi/functions.h index 2e328224..7e4d5454 100644 --- a/libgdf/include/gdf/cffi/functions.h +++ b/libgdf/include/gdf/cffi/functions.h @@ -1,18 +1,18 @@ #pragma once /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Start a NVTX range with predefined color. * * This function is useful only for profiling with nvvp or Nsight Systems. It * demarcates the begining of a user-defined range with a specified name and * color that will show up in the timeline view of nvvp/Nsight Systems. Can be * nested within other ranges. - * + * * @Param name The name of the NVTX range * @Param color The predefined gdf_color enum to use to color this range - * - * @Returns + * + * @Returns */ /* ----------------------------------------------------------------------------*/ gdf_error gdf_nvtx_range_push(char const * const name, gdf_color color ); @@ -21,47 +21,47 @@ gdf_error gdf_nvtx_range_push(char const * const name, gdf_color color ); /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Start a NVTX range with a custom ARGB color code. * * This function is useful only for profiling with nvvp or Nsight Systems. 
It * demarcates the begining of a user-defined range with a specified name and * color that will show up in the timeline view of nvvp/Nsight Systems. Can be * nested within other ranges. - * + * * @Param name The name of the NVTX range * @Param color The ARGB hex color code to use to color this range (e.g., 0xFF00FF00) - * - * @Returns + * + * @Returns */ /* ----------------------------------------------------------------------------*/ gdf_error gdf_nvtx_range_push_hex(char const * const name, unsigned int color ); /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Ends the inner-most NVTX range. * * This function is useful only for profiling with nvvp or Nsight Systems. It * will demarcate the end of the inner-most range, i.e., the most recent call to * gdf_nvtx_range_push. - * - * @Returns + * + * @Returns */ /* ----------------------------------------------------------------------------*/ gdf_error gdf_nvtx_range_pop(); /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Counts the number of valid bits in the mask that corresponds to * the specified number of rows. - * + * * @Param[in] masks Array of gdf_valid_types with enough bits to represent * num_rows number of rows * @Param[in] num_rows The number of rows represented in the bit-validity mask. * @Param[out] count The number of valid rows in the mask - * - * @Returns GDF_SUCCESS upon successful completion. + * + * @Returns GDF_SUCCESS upon successful completion. 
*/ /* ----------------------------------------------------------------------------*/ gdf_error gdf_count_nonzero_mask(gdf_valid_type const * masks, int num_rows, int * count); @@ -79,15 +79,15 @@ gdf_error gdf_column_view_augmented(gdf_column *column, void *data, gdf_valid_ty gdf_error gdf_column_free(gdf_column *column); /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Concatenates the gdf_columns into a single, contiguous column, * including the validity bitmasks - * - * @Param[out] output A column whose buffers are already allocated that will + * + * @Param[out] output A column whose buffers are already allocated that will * @Param[in] columns_to_conat[] The columns to concatenate * @Param[in] num_columns The number of columns to concatenate * contain the concatenation of the input columns - * + * * @Returns GDF_SUCCESS upon successful completion */ /* ----------------------------------------------------------------------------*/ @@ -198,10 +198,10 @@ gdf_error gdf_segmented_radixsort_generic(gdf_segmented_radixsort_plan_type *hdl /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Performs an inner join on the specified columns of two * dataframes (left, right) - * + * * @Param[in] left_cols[] The columns of the left dataframe * @Param[in] num_left_cols The number of columns in the left dataframe * @Param[in] left_join_cols[] The column indices of columns from the left dataframe @@ -218,13 +218,13 @@ gdf_error gdf_segmented_radixsort_generic(gdf_segmented_radixsort_plan_type *hdl * @Param[out] gdf_column * right_indices If not nullptr, indices of rows from the right table that match rows in the left table * @Param[in] join_context The context to use to control how the join is performed,e.g., * sort vs hash based implementation - * + * * @Returns GDF_SUCCESS if the join operation was successful, otherwise an appropriate * error code */ /* 
----------------------------------------------------------------------------*/ gdf_error gdf_inner_join( - gdf_column **left_cols, + gdf_column **left_cols, int num_left_cols, int left_join_cols[], gdf_column **right_cols, @@ -238,10 +238,10 @@ gdf_error gdf_inner_join( gdf_context *join_context); /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Performs a left join (also known as left outer join) on the * specified columns of two dataframes (left, right) - * + * * @Param[in] left_cols[] The columns of the left dataframe * @Param[in] num_left_cols The number of columns in the left dataframe * @Param[in] left_join_cols[] The column indices of columns from the left dataframe @@ -258,13 +258,13 @@ gdf_error gdf_inner_join( * @Param[out] gdf_column * right_indices If not nullptr, indices of rows from the right table that match rows in the left table * @Param[in] join_context The context to use to control how the join is performed,e.g., * sort vs hash based implementation - * + * * @Returns GDF_SUCCESS if the join operation was successful, otherwise an appropriate * error code */ /* ----------------------------------------------------------------------------*/ gdf_error gdf_left_join( - gdf_column **left_cols, + gdf_column **left_cols, int num_left_cols, int left_join_cols[], gdf_column **right_cols, @@ -278,10 +278,10 @@ gdf_error gdf_left_join( gdf_context *join_context); /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Performs a full join (also known as full outer join) on the * specified columns of two dataframes (left, right) - * + * * @Param[in] left_cols[] The columns of the left dataframe * @Param[in] num_left_cols The number of columns in the left dataframe * @Param[in] left_join_cols[] The column indices of columns from the left dataframe @@ -298,13 +298,13 @@ gdf_error gdf_left_join( * @Param[out] gdf_column * right_indices If not nullptr, indices of 
rows from the right table that match rows in the left table * @Param[in] join_context The context to use to control how the join is performed,e.g., * sort vs hash based implementation - * + * * @Returns GDF_SUCCESS if the join operation was successful, otherwise an appropriate * error code */ /* ----------------------------------------------------------------------------*/ gdf_error gdf_full_join( - gdf_column **left_cols, + gdf_column **left_cols, int num_left_cols, int left_join_cols[], gdf_column **right_cols, @@ -320,32 +320,32 @@ gdf_error gdf_full_join( /* partioning */ /* --------------------------------------------------------------------------*/ -/** - * @brief Computes the hash values of the rows in the specified columns of the - * input columns and bins the hash values into the desired number of partitions. - * Rearranges the input columns such that rows with hash values in the same bin +/** + * @brief Computes the hash values of the rows in the specified columns of the + * input columns and bins the hash values into the desired number of partitions. + * Rearranges the input columns such that rows with hash values in the same bin * are contiguous. - * + * * @Param[in] num_input_cols The number of columns in the input columns * @Param[in] input[] The input set of columns * @Param[in] columns_to_hash[] Indices of the columns in the input set to hash * @Param[in] num_cols_to_hash The number of columns to hash * @Param[in] num_partitions The number of partitions to rearrange the input rows into - * @Param[out] partitioned_output Preallocated gdf_columns to hold the rearrangement + * @Param[out] partitioned_output Preallocated gdf_columns to hold the rearrangement * of the input columns into the desired number of partitions * @Param[out] partition_offsets Preallocated array the size of the number of * partitions. 
Where partition_offsets[i] indicates the starting position * of partition 'i' * @Param[in] hash The hash function to use - * + * * @Returns If the operation was successful, returns GDF_SUCCESS */ /* ----------------------------------------------------------------------------*/ -gdf_error gdf_hash_partition(int num_input_cols, - gdf_column * input[], +gdf_error gdf_hash_partition(int num_input_cols, + gdf_column * input[], int columns_to_hash[], int num_cols_to_hash, - int num_partitions, + int num_partitions, gdf_column * partitioned_output[], int partition_offsets[], gdf_hash_func hash); @@ -363,14 +363,14 @@ gdf_error gdf_prefixsum_i64(gdf_column *inp, gdf_column *out, int inclusive); /* hashing */ /* --------------------------------------------------------------------------*/ -/** +/** * @Synopsis Computes the hash value of each row in the input set of columns. - * + * * @Param num_cols The number of columns in the input set * @Param input The list of columns whose rows will be hashed * @Param hash The hash function to use * @Param output The hash value of each row of the input - * + * * @Returns GDF_SUCCESS if the operation was successful, otherwise an appropriate * error code */ @@ -702,7 +702,7 @@ gdf_error gpu_hash_columns(gdf_column ** columns_to_hash, int num_columns, gdf_c gdf_error get_column_byte_width(gdf_column * col, int * width); -/* +/* Multi-Column SQL ops: WHERE (Filtering) ORDER-BY @@ -783,3 +783,16 @@ gdf_error gdf_quantile_aprrox( gdf_column* col_in, //input column; double q, //requested quantile in [0,1] void* t_erased_res, //type-erased result of same type as column; gdf_context* ctxt); //context info + +/* replace */ + +/// \brief For each value in `to_replace`, find all instances of that value +/// in `column` and replace it with the corresponding value in `values`. 
+/// \param[in,out] column data +/// \param[in] to_replace contains values of column that will be replaced +/// \param[in] values contains the replacement values +/// +/// Note that `to_replace` and `values` are related by the index +gdf_error gdf_find_and_replace_all(gdf_column * column, + const gdf_column *to_replace, + const gdf_column *values); diff --git a/libgdf/src/tests/CMakeLists.txt b/libgdf/src/tests/CMakeLists.txt index abf3d2fa..783799cf 100644 --- a/libgdf/src/tests/CMakeLists.txt +++ b/libgdf/src/tests/CMakeLists.txt @@ -46,6 +46,7 @@ add_subdirectory(datetime) add_subdirectory(hashing) add_subdirectory(join) add_subdirectory(sqls) +add_subdirectory(replace) add_subdirectory(hash_map) add_subdirectory(groupby) add_subdirectory(unaryops) diff --git a/libgdf/thirdparty/cnmem b/libgdf/thirdparty/cnmem deleted file mode 160000 index 28a182d4..00000000 --- a/libgdf/thirdparty/cnmem +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540 diff --git a/libgdf/thirdparty/cub b/libgdf/thirdparty/cub deleted file mode 160000 index b165e1fb..00000000 --- a/libgdf/thirdparty/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b165e1fb11eeea64ccf95053e40f2424312599cc diff --git a/libgdf/thirdparty/moderngpu b/libgdf/thirdparty/moderngpu deleted file mode 160000 index c1fd31df..00000000 --- a/libgdf/thirdparty/moderngpu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c1fd31df008d79f727483e795b1ee1ce45b782ca diff --git a/src/bench/CMakeLists.txt b/src/bench/CMakeLists.txt new file mode 100644 index 00000000..adff8993 --- /dev/null +++ b/src/bench/CMakeLists.txt @@ -0,0 +1,57 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +if(GDF_BENCHMARK) + +include(ExternalProject) + +ExternalProject_Add(benchmark_ep + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=RELEASE + -DCMAKE_INSTALL_PREFIX=build + -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.4.1 + UPDATE_COMMAND "" +) +ExternalProject_Get_property(benchmark_ep BINARY_DIR) +set(BENCHMARK_ROOT ${BINARY_DIR}/build) + +file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/include) +file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/lib) + +add_library(Google::Benchmark INTERFACE IMPORTED) +add_dependencies(Google::Benchmark benchmark_ep) +set_target_properties(Google::Benchmark + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${BENCHMARK_ROOT}/include) +set_target_properties(Google::Benchmark + PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark.a) + +add_library(Google::Benchmark::Main INTERFACE IMPORTED) +set_target_properties(Google::Benchmark::Main + PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark_main.a) + + +function(GDF_ADD_BENCHMARK TARGET) + list(REMOVE_AT ARGV 0) + cuda_add_executable(${TARGET} ${ARGV}) + target_link_libraries(${TARGET} Google::Benchmark Google::Benchmark::Main gdf) +endfunction() + + +add_subdirectory(replace) +endif() diff --git a/src/bench/replace/CMakeLists.txt b/src/bench/replace/CMakeLists.txt new file mode 100644 index 00000000..4d7296fc --- /dev/null +++ b/src/bench/replace/CMakeLists.txt @@ -0,0 +1,18 @@ 
+#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +GDF_ADD_BENCHMARK(replace-benchmark replace-benchmark.cu) diff --git a/src/bench/replace/replace-benchmark.cu b/src/bench/replace/replace-benchmark.cu new file mode 100644 index 00000000..a2001d44 --- /dev/null +++ b/src/bench/replace/replace-benchmark.cu @@ -0,0 +1,111 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +#include + +#include "../../tests/replace/utils.h" + +using T = std::int64_t; + +static void +BM_CPU_LoopReplace(benchmark::State &state) { + const std::size_t length = state.range(0); + + std::vector vector(length); + thrust::sequence(vector.begin(), vector.end(), 1); + + std::vector to_replace_vector(10); + thrust::sequence(to_replace_vector.begin(), to_replace_vector.end(), 1); + + std::vector values_vector(10); + thrust::sequence(values_vector.begin(), values_vector.end(), 1); + + for (auto _ : state) { + for (std::size_t i = 0; i < vector.size(); i++) { + auto current = std::find( + to_replace_vector.begin(), to_replace_vector.end(), vector[i]); + if (current != to_replace_vector.end()) { + std::size_t j = + std::distance(to_replace_vector.begin(), current); + vector[i] = values_vector[j]; + } + } + } +} + +static void +BM_CPU_MapReplace(benchmark::State &state) { + const std::size_t length = state.range(0); + + std::vector vector(length); + thrust::sequence(vector.begin(), vector.end(), 1); + + std::vector to_replace_vector(10); + thrust::sequence(to_replace_vector.begin(), to_replace_vector.end(), 1); + + std::vector values_vector(10); + thrust::sequence(values_vector.begin(), values_vector.end(), 1); + + for (auto _ : state) { + std::unordered_map map; + for (std::size_t i = 0; i < values_vector.size(); i++) { + map.insert({to_replace_vector[i], values_vector[i]}); + } + + for (std::size_t i = 0; i < vector.size(); i++) { + try { + vector[i] = map[vector[i]]; + } catch (...) 
{ continue; } + } + } +} + +static void +BM_GPU_LoopReplace(benchmark::State &state) { + const std::size_t length = state.range(0); + + thrust::device_vector device_vector(length); + thrust::sequence(device_vector.begin(), device_vector.end(), 1); + gdf_column column = MakeGdfColumn(device_vector); + + thrust::device_vector to_replace_vector(10); + thrust::sequence(to_replace_vector.begin(), to_replace_vector.end(), 1); + gdf_column to_replace = MakeGdfColumn(to_replace_vector); + + thrust::device_vector values_vector(10); + thrust::sequence(values_vector.begin(), values_vector.end(), 1); + gdf_column values = MakeGdfColumn(values_vector); + + for (auto _ : state) { + const gdf_error status = + gdf_find_and_replace_all(&column, &to_replace, &values); + state.PauseTiming(); + if (status != GDF_SUCCESS) { state.SkipWithError("Failed replace"); } + state.ResumeTiming(); + } +} + +BENCHMARK(BM_CPU_LoopReplace)->Ranges({{8, 8 << 16}, {8, 512}}); +BENCHMARK(BM_CPU_MapReplace)->Ranges({{8, 8 << 16}, {8, 512}}); +BENCHMARK(BM_GPU_LoopReplace)->Ranges({{8, 8 << 16}, {8, 512}}); diff --git a/src/replace.cu b/src/replace.cu new file mode 100644 index 00000000..bfe827a5 --- /dev/null +++ b/src/replace.cu @@ -0,0 +1,175 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +namespace { + +//! 
traits to get primitive type from gdf dtype +template +struct gdf_dtype_traits {}; + +#define DTYPE_FACTORY(DTYPE, T) \ + template <> \ + struct gdf_dtype_traits { \ + typedef T value_type; \ + } + +DTYPE_FACTORY(INT8, std::int8_t); +DTYPE_FACTORY(INT16, std::int16_t); +DTYPE_FACTORY(INT32, std::int32_t); +DTYPE_FACTORY(INT64, std::int64_t); +DTYPE_FACTORY(FLOAT32, float); +DTYPE_FACTORY(FLOAT64, double); +DTYPE_FACTORY(DATE32, std::int32_t); +DTYPE_FACTORY(DATE64, std::int64_t); +DTYPE_FACTORY(TIMESTAMP, std::int64_t); + +#undef DTYPE_FACTORY + +/// \brief Replace kernel +/// \param[in,out] data with elements to be replaced +/// \param[in] values contains the replacement values +/// \param[in] to_replace_begin begin pointer of `to_replace` array +/// \param[in] to_replace_end end pointer of `to_replace` array +template +__global__ void +replace_kernel(T *const data, + const std::size_t data_size, + const T *const values, + const thrust::device_ptr to_replace_begin, + const thrust::device_ptr to_replace_end) { + for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < data_size; + i += blockDim.x * gridDim.x) { + // TODO: find by map kernel + const thrust::device_ptr found_ptr = thrust::find( + thrust::device, to_replace_begin, to_replace_end, data[i]); + + if (found_ptr != to_replace_end) { + typename thrust::iterator_traits< + const thrust::device_ptr>::difference_type + value_found_index = thrust::distance(to_replace_begin, found_ptr); + + data[i] = values[value_found_index]; + } + } +} + +/// \brief Call replace kernel according to primitive type T +/// \param[in,out] data with elements to be replaced +/// \param[in] data_size number of elements in data +/// \param[in] to_replace contains values that will be replaced +/// \param[in] values contains the replacement values +/// \param[in] replacement_ptrdiff to get the end pointer of `to_replace` array +template +static inline gdf_error +Replace(T *const data, + const std::size_t data_size, + const T
*const to_replace, + const T *const values, + const std::ptrdiff_t replacement_ptrdiff) { + const std::size_t blocks = std::ceil(data_size / 256.); + + const thrust::device_ptr to_replace_begin(to_replace); + const thrust::device_ptr to_replace_end(to_replace_begin + + replacement_ptrdiff); + + replace_kernel<<>>( // TODO: calc blocks and threads + data, + data_size, + values, + to_replace_begin, + to_replace_end); + + return GDF_SUCCESS; +} + +/// \brief Check if two gdf_columns have different sizes +/// \param[in] to_replace is a gdf_column +/// \param[in] values is a gdf_column +static inline bool +NotEqualReplacementSize(const gdf_column *to_replace, + const gdf_column *values) { + return to_replace->size != values->size; +} + +/// \brief Check if the three gdf columns do not all have the same dtype +/// \param[in] column is a gdf_column +/// \param[in] to_replace is a gdf_column +/// \param[in] values is a gdf_column +static inline bool +NotSameDType(const gdf_column *column, + const gdf_column *to_replace, + const gdf_column *values) { + return column->dtype != to_replace->dtype + || to_replace->dtype != values->dtype; +} + +} // namespace + +/// \brief For each value in `to_replace`, find all instances of that value +/// in `column` and replace it with the corresponding value in `values`.
+/// \param[in,out] column data +/// \param[in] to_replace contains values of column that will be replaced +/// \param[in] values contains the replacement values +/// +/// Note that `to_replace` and `values` are related by the index +gdf_error +gdf_find_and_replace_all(gdf_column * column, + const gdf_column *to_replace, + const gdf_column *values) { + if (NotEqualReplacementSize(to_replace, values)) { + return GDF_COLUMN_SIZE_MISMATCH; + } + + if (NotSameDType(column, to_replace, values)) { return GDF_CUDA_ERROR; } + + switch (column->dtype) { +#define REPLACE_CASE(DTYPE) \ + case GDF_##DTYPE: { \ + using value_type = gdf_dtype_traits::value_type; \ + return Replace(static_cast(column->data), \ + static_cast(column->size), \ + static_cast(to_replace->data), \ + static_cast(values->data), \ + static_cast(values->size)); \ + } + + REPLACE_CASE(INT8); + REPLACE_CASE(INT16); + REPLACE_CASE(INT32); + REPLACE_CASE(INT64); + REPLACE_CASE(FLOAT32); + REPLACE_CASE(FLOAT64); + REPLACE_CASE(DATE32); + REPLACE_CASE(DATE64); + REPLACE_CASE(TIMESTAMP); + +#undef REPLACE_CASE + + case GDF_invalid: + default: return GDF_UNSUPPORTED_DTYPE; + } +} diff --git a/src/tests/replace/CMakeLists.txt b/src/tests/replace/CMakeLists.txt new file mode 100644 index 00000000..bdf3c5d2 --- /dev/null +++ b/src/tests/replace/CMakeLists.txt @@ -0,0 +1,18 @@ +#============================================================================= +# Copyright 2018 BlazingDB, Inc. +# Copyright 2018 Cristhian Alberto Gonzales Castillo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +configure_test(replace-test replace-test.cu) diff --git a/src/tests/replace/replace-test.cu b/src/tests/replace/replace-test.cu new file mode 100644 index 00000000..c6ec346a --- /dev/null +++ b/src/tests/replace/replace-test.cu @@ -0,0 +1,179 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include "utils.h" + +template +static inline thrust::device_vector +MakeDeviceVector(const std::initializer_list list) { + const std::vector column_data(list); + thrust::device_vector device_data(column_data); + return device_data; +} + +// This is the main test feature +template +class ReplaceTest : public testing::Test { +protected: + thrust::device_ptr + test(const std::initializer_list &data_list, + const std::initializer_list &to_replace_list, + const std::initializer_list &values_list) { + device_data = MakeDeviceVector(data_list); + to_replace_data = MakeDeviceVector(to_replace_list); + values_data = MakeDeviceVector(values_list); + + column = MakeGdfColumn(device_data); + to_replace = MakeGdfColumn(to_replace_data); + values = MakeGdfColumn(values_data); + + const gdf_error status = + gdf_find_and_replace_all(&column, &to_replace, &values); + + EXPECT_EQ(GDF_SUCCESS, status); + + return thrust::device_ptr(static_cast(column.data)); + } + + thrust::device_vector device_data; + thrust::device_vector to_replace_data; + thrust::device_vector values_data; + + gdf_column column; + gdf_column to_replace; + gdf_column values; +}; + +using Types = testing:: + Types; + +TYPED_TEST_CASE(ReplaceTest, Types); + +// Simple test, replacing all even values +TYPED_TEST(ReplaceTest, ReplaceEvenPosition) { + thrust::device_ptr results = + this->test({1, 2, 3, 4, 5, 6, 7, 8}, {2, 4, 6, 8}, {0, 2, 4, 6}); + + EXPECT_EQ(0, results[1]); + EXPECT_EQ(2, results[3]); + EXPECT_EQ(4, results[5]); + EXPECT_EQ(6, results[7]); +} + +// Similar test as ReplaceEvenPosition, but with unordered data +TYPED_TEST(ReplaceTest, Unordered) { + thrust::device_ptr results = + this->test({7, 5, 6, 3, 1, 2, 8, 4}, {2, 6, 4, 8}, {0, 4, 2, 6}); + + EXPECT_EQ(4, results[2]); + EXPECT_EQ(0, results[5]); + EXPECT_EQ(6, results[6]); + EXPECT_EQ(2, results[7]); +} + +// Testing with Empty Replace +TYPED_TEST(ReplaceTest, EmptyReplace) { + thrust::device_ptr results = + 
this->test({7, 5, 6, 3, 1, 2, 8, 4}, {}, {}); + + EXPECT_EQ(7, results[0]); + EXPECT_EQ(5, results[1]); + EXPECT_EQ(6, results[2]); + EXPECT_EQ(3, results[3]); + EXPECT_EQ(1, results[4]); + EXPECT_EQ(2, results[5]); + EXPECT_EQ(8, results[6]); + EXPECT_EQ(4, results[7]); +} + +// Testing with Nothing To Replace +TYPED_TEST(ReplaceTest, NothingToReplace) { + thrust::device_ptr results = + this->test({7, 5, 6, 3, 1, 2, 8, 4}, {10, 11, 12}, {15, 16, 17}); + + EXPECT_EQ(7, results[0]); + EXPECT_EQ(5, results[1]); + EXPECT_EQ(6, results[2]); + EXPECT_EQ(3, results[3]); + EXPECT_EQ(1, results[4]); + EXPECT_EQ(2, results[5]); + EXPECT_EQ(8, results[6]); + EXPECT_EQ(4, results[7]); +} + +// Testing with Empty Data +TYPED_TEST(ReplaceTest, EmptyData) { + this->test({}, {10, 11, 12}, {15, 16, 17}); +} + +// Test with much larger data sets +TEST(LargeScaleReplaceTest, LargeScaleReplaceTest) { + const int DATA_SIZE = 1000000; + const int REPLACE_SIZE = 10000; + + srand((unsigned) time(NULL)); + + std::vector column_data(DATA_SIZE); + for (int i = 0; i < DATA_SIZE; i++) { + column_data[i] = rand() % (2 * REPLACE_SIZE); + } + + std::vector from(DATA_SIZE); + std::vector to(DATA_SIZE); + int count = 0; + for (int i = 0; i < 7; i++) { + for (int j = 0; j < REPLACE_SIZE; j += 7) { + from[i + j] = count; + count++; + to[i + j] = count; + } + } + + thrust::device_vector device_data(column_data); + gdf_column data_gdf = MakeGdfColumn(device_data); + thrust::device_vector device_from(from); + gdf_column from_gdf = MakeGdfColumn(device_from); + thrust::device_vector device_to(to); + gdf_column to_gdf = MakeGdfColumn(device_to); + + const gdf_error status = + gdf_find_and_replace_all(&data_gdf, &from_gdf, &to_gdf); + + EXPECT_EQ(GDF_SUCCESS, status); + + std::vector replaced_data(DATA_SIZE); + thrust::copy(device_data.begin(), device_data.end(), replaced_data.begin()); + + count = 0; + for (int i = 0; i < DATA_SIZE; i++) { + if (column_data[i] < REPLACE_SIZE) { + EXPECT_EQ(column_data[i] 
+ 1, replaced_data[i]); + if (column_data[i] + 1 != replaced_data[i]) { + std::cout << "failed at " << i + << " column_data[i]: " << column_data[i] + << " replaced_data[i]: " << replaced_data[i] + << std::endl; + count++; + if (count > 20) { break; } + } + } + } +} diff --git a/src/tests/replace/utils.h b/src/tests/replace/utils.h new file mode 100644 index 00000000..54fa26ca --- /dev/null +++ b/src/tests/replace/utils.h @@ -0,0 +1,67 @@ +/* + * Copyright 2018 BlazingDB, Inc. + * Copyright 2018 Cristhian Alberto Gonzales Castillo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +//! traits to get gdf dtype from primitive type +template +struct TypeTraits {}; + +#define TYPE_FACTORY(U, D) \ + template <> \ + struct TypeTraits { \ + static constexpr gdf_dtype dtype = GDF_##D; \ + } + +TYPE_FACTORY(std::int8_t, INT8); +TYPE_FACTORY(std::int16_t, INT16); +TYPE_FACTORY(std::int32_t, INT32); +TYPE_FACTORY(std::int64_t, INT64); +TYPE_FACTORY(float, FLOAT32); +TYPE_FACTORY(double, FLOAT64); + +#undef TYPE_FACTORY + +//! Convert thrust device vector to gdf_column +template +static inline gdf_column +MakeGdfColumn(thrust::device_vector &device_vector) { + return gdf_column{ + .data = thrust::raw_pointer_cast(device_vector.data()), + .valid = nullptr, + .size = device_vector.size(), + .dtype = TypeTraits::dtype, + .null_count = 0, + .dtype_info = {}, + }; +} + +//! 
Convert STL vector to gdf_column +template +static inline gdf_column +MakeGdfColumn(std::vector &vector) { + return gdf_column{ + .data = vector.data(), + .valid = nullptr, + .size = vector.size(), + .dtype = TypeTraits::dtype, + .null_count = 0, + .dtype_info = {}, + }; +}