Skip to content
This repository was archived by the owner on Dec 21, 2018. It is now read-only.

[REVIEW] Replace function #106

Open
wants to merge 36 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
350dafc
[replace-function] API definition
gcca Aug 22, 2018
8f805d9
[replace-function] Merged from master
gcca Aug 23, 2018
5a8b93a
[replace-function] Add first implementation
gcca Aug 23, 2018
adf537d
[replace-function] Add replacement by lower bound
gcca Aug 23, 2018
3214b08
[replace-function] Update typed unit test
gcca Aug 24, 2018
3935048
[replace-function] Update to replace on unordered column
gcca Aug 30, 2018
a4a7383
[replace-function] Merged from master
gcca Aug 30, 2018
f5afc31
[replace-function] Update class replace functor
gcca Aug 30, 2018
ef3f382
[replace-function] Common test fixtures
gcca Aug 30, 2018
003bd9d
created larger scale test for replace funtion
Aug 31, 2018
5553c67
[replace-function] Replace kernel
gcca Sep 3, 2018
0f3891d
[replace-function] Check device attribute status
gcca Sep 4, 2018
26c522a
[replace-function] Move ptr construction (invariant)
gcca Sep 4, 2018
2a7a023
[replace-function] Add replace benchmark against cpu
gcca Sep 5, 2018
200da31
Merge branch 'master' into replace-function
Sep 27, 2018
a2fa767
moved replace benchmark to bench folder. Added comments and more test…
Sep 28, 2018
cc3beca
[replace-function] API definition
gcca Aug 22, 2018
ef4de00
[replace-function] Add first implementation
gcca Aug 23, 2018
0b62cd9
[replace-function] Add replacement by lower bound
gcca Aug 23, 2018
5f2c338
[replace-function] Update typed unit test
gcca Aug 24, 2018
35c765d
[replace-function] Update to replace on unordered column
gcca Aug 30, 2018
d3e50d3
[replace-function] Update class replace functor
gcca Aug 30, 2018
7c12b1a
[replace-function] Common test fixtures
gcca Aug 30, 2018
c01252b
created larger scale test for replace funtion
Aug 31, 2018
321656e
[replace-function] Replace kernel
gcca Sep 3, 2018
7f80017
[replace-function] Check device attribute status
gcca Sep 4, 2018
207fe0e
[replace-function] Move ptr construction (invariant)
gcca Sep 4, 2018
8767314
[replace-function] Add replace benchmark against cpu
gcca Sep 5, 2018
622abbd
moved replace benchmark to bench folder. Added comments and more test…
Sep 28, 2018
d364fd8
[replace-function] Add documentation
gcca Oct 17, 2018
95a8a21
[replace-function] Merged from remote
gcca Oct 17, 2018
99f6ebf
Merge branch 'master' into replace-function
gcca Oct 25, 2018
c5f27c1
[replace-function] Update function name
gcca Oct 25, 2018
b8e7ddd
[replace-function] Update name for test
gcca Oct 25, 2018
fdb7afc
[replace-function] Update function for benchmark
gcca Oct 25, 2018
41b058c
Merge upstream 'master' into replace-function
gcca Oct 26, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ cuda_add_library(gdf SHARED
src/sqls_ops.cu
src/streamcompactionops.cu
src/unaryops.cu
src/replace.cu
#src/windowedops.cu
src/quantiles.cu
)
Expand Down
14 changes: 13 additions & 1 deletion include/gdf/cffi/functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ gdf_error gpu_hash_columns(gdf_column ** columns_to_hash, int num_columns, gdf_c

gdf_error get_column_byte_width(gdf_column * col, int * width);

/*
/*
Multi-Column SQL ops:
WHERE (Filtering)
ORDER-BY
Expand Down Expand Up @@ -577,3 +577,15 @@ gdf_error gdf_quantile_aprrox( gdf_column* col_in, //input column;
double q, //requested quantile in [0,1]
void* t_erased_res, //type-erased result of same type as column;
gdf_context* ctxt); //context info

/* replace */

/// \brief Replace `to_replace` data of `column` with `values`
/// \param[in/out] column data
/// \param[in] to_replace contains values of column that will be replaced
/// \param[in] values contains the replacement values
///
/// Note that `to_replace` and `values` are related by the index
gdf_error gdf_replace(gdf_column * column,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The interface is confusing to me. Why does it have column and to_replace parameters? What happens to column? What happens to to_replace? Why not just make a gdf_copy(out_column, in_column)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I see. The meaning of "replace" is not clear. It's not a copy. The semantics are actually those of "find_and_replace_all": For each value in to_replace, find all instances of that value in column and replace it with the corresponding value in values. This should be made clear in the header documentation for the function. Consider changing the name to gdf_find_and_replace_all() or something like that...

const gdf_column *to_replace,
const gdf_column *values);
150 changes: 150 additions & 0 deletions src/replace.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/*
* Copyright 2018 BlazingDB, Inc.
* Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cmath>

#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/replace.h>

#include <gdf/gdf.h>

namespace {

template <gdf_dtype DTYPE>
struct gdf_dtype_traits {};

#define DTYPE_FACTORY(DTYPE, T) \
template <> \
struct gdf_dtype_traits<GDF_##DTYPE> { \
typedef T value_type; \
}

DTYPE_FACTORY(INT8, std::int8_t);
DTYPE_FACTORY(INT16, std::int16_t);
DTYPE_FACTORY(INT32, std::int32_t);
DTYPE_FACTORY(INT64, std::int64_t);
DTYPE_FACTORY(FLOAT32, float);
DTYPE_FACTORY(FLOAT64, double);
DTYPE_FACTORY(DATE32, std::int32_t);
DTYPE_FACTORY(DATE64, std::int64_t);
DTYPE_FACTORY(TIMESTAMP, std::int64_t);

#undef DTYPE_FACTORY

template <class T>
__global__ void
replace_kernel(T *const data,
const std::size_t data_size,
const T *const values,
const thrust::device_ptr<const T> to_replace_begin,
const thrust::device_ptr<const T> to_replace_end) {
for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < data_size;
i += blockDim.x * gridDim.x) {
// TODO: find by map kernel
const thrust::device_ptr<const T> found_ptr = thrust::find(
thrust::device, to_replace_begin, to_replace_end, data[i]);

if (found_ptr != to_replace_end) {
typename thrust::iterator_traits<
const thrust::device_ptr<const T>>::difference_type
value_found_index = thrust::distance(to_replace_begin, found_ptr);

data[i] = values[value_found_index];
}
}
}

template <class T>
static inline gdf_error
Replace(T *const data,
const std::size_t data_size,
const T *const to_replace,
const T *const values,
const std::ptrdiff_t replacement_ptrdiff) {
const std::size_t blocks = std::ceil(data_size / 256.);

const thrust::device_ptr<const T> to_replace_begin(to_replace);
const thrust::device_ptr<const T> to_replace_end(to_replace_begin
+ replacement_ptrdiff);

replace_kernel<T>
<<<blocks, 256>>>( // TODO: calc blocks and threads
data,
data_size,
values,
to_replace_begin,
to_replace_end);

return GDF_SUCCESS;
}

static inline bool
NotEqualReplacementSize(const gdf_column *to_replace,
const gdf_column *values) {
return to_replace->size != values->size;
}

static inline bool
NotSameDType(const gdf_column *column,
const gdf_column *to_replace,
const gdf_column *values) {
return column->dtype != to_replace->dtype
|| to_replace->dtype != values->dtype;
}

} // namespace

gdf_error
gdf_replace(gdf_column * column,
const gdf_column *to_replace,
const gdf_column *values) {
if (NotEqualReplacementSize(to_replace, values)) {
return GDF_COLUMN_SIZE_MISMATCH;
}

if (NotSameDType(column, to_replace, values)) { return GDF_CUDA_ERROR; }

switch (column->dtype) {
#define WHEN(DTYPE) \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really like calling this macro "WHEN" -- it's not descriptive. Something like REPLACE_CASE would be clearer. Then you would have

switch (column->type) {
    REPLACE_CASE(INT8);
    REPLACE_CASE(INT16);
    // etc.
}

case GDF_##DTYPE: { \
using value_type = gdf_dtype_traits<GDF_##DTYPE>::value_type; \
return Replace(static_cast<value_type *>(column->data), \
static_cast<std::size_t>(column->size), \
static_cast<value_type *>(to_replace->data), \
static_cast<value_type *>(values->data), \
static_cast<std::ptrdiff_t>(values->size)); \
}

WHEN(INT8);
WHEN(INT16);
WHEN(INT32);
WHEN(INT64);
WHEN(FLOAT32);
WHEN(FLOAT64);
WHEN(DATE32);
WHEN(DATE64);
WHEN(TIMESTAMP);

#undef WHEN

case GDF_invalid:
default: return GDF_UNSUPPORTED_DTYPE;
}
}
1 change: 1 addition & 0 deletions src/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ add_subdirectory(datetime)
add_subdirectory(hashing)
add_subdirectory(join)
add_subdirectory(sqls)
add_subdirectory(replace)
add_subdirectory(hash_map)
add_subdirectory(groupby)
add_subdirectory(unaryops)
Expand Down
57 changes: 57 additions & 0 deletions src/tests/replace/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#=============================================================================
# Copyright 2018 BlazingDB, Inc.
# Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================

configure_test(replace-test replace-test.cu)

if (GDF_BENCHMARK)
include(ExternalProject)

ExternalProject_Add(benchmark_ep
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=RELEASE
-DCMAKE_INSTALL_PREFIX=build
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.4.1
UPDATE_COMMAND ""
)
ExternalProject_Get_property(benchmark_ep BINARY_DIR)
set(BENCHMARK_ROOT ${BINARY_DIR}/build)

file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/include)
file(MAKE_DIRECTORY ${BENCHMARK_ROOT}/lib)

add_library(Google::Benchmark INTERFACE IMPORTED)
add_dependencies(Google::Benchmark benchmark_ep)
set_target_properties(Google::Benchmark
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${BENCHMARK_ROOT}/include)
set_target_properties(Google::Benchmark
PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark.a)

add_library(Google::Benchmark::Main INTERFACE IMPORTED)
set_target_properties(Google::Benchmark::Main
PROPERTIES INTERFACE_LINK_LIBRARIES ${BENCHMARK_ROOT}/lib/libbenchmark_main.a)

function(GDF_ADD_BENCHMARK TARGET)
list(REMOVE_AT ARGV 0)
cuda_add_executable(${TARGET} ${ARGV})
target_link_libraries(${TARGET}
Google::Benchmark Google::Benchmark::Main gdf)
endfunction()

GDF_ADD_BENCHMARK(replace-benchmark replace-benchmark.cu)
endif()
110 changes: 110 additions & 0 deletions src/tests/replace/replace-benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be moved to a separate bench folder as we discussed in the binary ops PR review?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And separated into benchmarks and unit tests?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

* Copyright 2018 BlazingDB, Inc.
* Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>

#include <unordered_map>

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <gdf/gdf.h>

#include "utils.h"

using T = std::int64_t;

static void
BM_CPU_LoopReplace(benchmark::State &state) {
const std::size_t length = state.range(0);

std::vector<T> vector(length);
thrust::sequence(vector.begin(), vector.end(), 1);

std::vector<T> to_replace_vector(10);
thrust::sequence(to_replace_vector.begin(), to_replace_vector.end(), 1);

std::vector<T> values_vector(10);
thrust::sequence(values_vector.begin(), values_vector.end(), 1);

for (auto _ : state) {
for (std::size_t i = 0; i < vector.size(); i++) {
auto current = std::find(
to_replace_vector.begin(), to_replace_vector.end(), vector[i]);
if (current != to_replace_vector.end()) {
std::size_t j =
std::distance(to_replace_vector.begin(), current);
vector[i] = values_vector[j];
}
}
}
}

static void
BM_CPU_MapReplace(benchmark::State &state) {
const std::size_t length = state.range(0);

std::vector<T> vector(length);
thrust::sequence(vector.begin(), vector.end(), 1);

std::vector<T> to_replace_vector(10);
thrust::sequence(to_replace_vector.begin(), to_replace_vector.end(), 1);

std::vector<T> values_vector(10);
thrust::sequence(values_vector.begin(), values_vector.end(), 1);

for (auto _ : state) {
std::unordered_map<T, T> map;
for (std::size_t i = 0; i < values_vector.size(); i++) {
map.insert({to_replace_vector[i], values_vector[i]});
}

for (std::size_t i = 0; i < vector.size(); i++) {
try {
vector[i] = map[vector[i]];
} catch (...) { continue; }
}
}
}

static void
BM_GPU_LoopReplace(benchmark::State &state) {
const std::size_t length = state.range(0);

thrust::device_vector<T> device_vector(length);
thrust::sequence(device_vector.begin(), device_vector.end(), 1);
gdf_column column = MakeGdfColumn(device_vector);

thrust::device_vector<T> to_replace_vector(10);
thrust::sequence(to_replace_vector.begin(), to_replace_vector.end(), 1);
gdf_column to_replace = MakeGdfColumn(to_replace_vector);

thrust::device_vector<T> values_vector(10);
thrust::sequence(values_vector.begin(), values_vector.end(), 1);
gdf_column values = MakeGdfColumn(values_vector);

for (auto _ : state) {
const gdf_error status = gdf_replace(&column, &to_replace, &values);
state.PauseTiming();
if (status != GDF_SUCCESS) { state.SkipWithError("Failed replace"); }
state.ResumeTiming();
}
}

BENCHMARK(BM_CPU_LoopReplace)->Ranges({{8, 8 << 16}, {8, 512}});
BENCHMARK(BM_CPU_MapReplace)->Ranges({{8, 8 << 16}, {8, 512}});
BENCHMARK(BM_GPU_LoopReplace)->Ranges({{8, 8 << 16}, {8, 512}});
Loading