Skip to content

Commit

Permalink
Move nvtext ngrams benchmarks to nvbench (#17173)
Browse files Browse the repository at this point in the history
Moves the `nvtext::generate_ngrams` and `nvtext::generate_character_ngrams` benchmarks from google-bench to nvbench.
Target parameters are exposed to help with profiling.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #17173
  • Loading branch information
davidwendt authored Oct 25, 2024
1 parent e98e6b9 commit 0bb699e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 42 deletions.
6 changes: 3 additions & 3 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -345,11 +345,11 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary

# ##################################################################################################
# * nvtext benchmark -------------------------------------------------------------------
ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
ConfigureBench(TEXT_BENCH text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/normalize.cpp
text/replace.cpp text/tokenize.cpp text/vocab.cpp
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
)

# ##################################################################################################
Expand Down
65 changes: 26 additions & 39 deletions cpp/benchmarks/text/ngrams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,58 +15,45 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/string/string_bench_args.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/generate_ngrams.hpp>

// Empty fixture type derived from cudf::benchmark; it is only used as the fixture name
// passed to BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F by NVTEXT_BENCHMARK_DEFINE.
class TextNGrams : public cudf::benchmark {};
#include <nvbench/nvbench.cuh>

// Selects which generator BM_ngrams runs: `tokens` -> nvtext::generate_ngrams,
// `characters` -> nvtext::generate_character_ngrams.
enum class ngrams_type { tokens, characters };

// Benchmark body for nvtext n-gram generation over a randomly generated strings column.
//
// NOTE(review): this span is a rendered diff, so two implementations are interleaved without
// +/- markers: the google-benchmark code (BM_ngrams / generate_bench_args, using
// benchmark::State, state.range, cuda_event_timer, SetBytesProcessed and
// generate_string_bench_args) and the nvbench code (bench_ngrams, using nvbench::state,
// state.get_int64 / get_string and state.exec). Presumably the google-benchmark lines are the
// removed side — confirm against the checked-in ngrams.cpp; the text below is not compilable as-is.
static void BM_ngrams(benchmark::State& state, ngrams_type nt)
static void bench_ngrams(nvbench::state& state)
{
// google-benchmark version: row count and maximum string length come from positional ranges.
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
// nvbench version: inputs come from the named axes "num_rows", "row_width" and "type".
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const ngram_type = state.get_string("type");

// Random STRING column whose profile uses a NORMAL distribution with bounds 0 and the
// configured width (max_str_length in one version, row_width in the other).
data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
cudf::strings_column_view input(column->view());
// Separator passed to generate_ngrams.
auto const separator = cudf::string_scalar("_");

// google-benchmark version: each iteration is wrapped in a cuda_event_timer and dispatches
// on the ngrams_type argument.
for (auto _ : state) {
cuda_event_timer raii(state, true);
switch (nt) {
case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
}
}
// nvbench version: run the benchmark on cudf's default stream.
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
}
// nvbench version: report chars_size bytes read and 2 * chars_size bytes written.
// NOTE(review): the 2x write factor looks like an estimate of the output size — confirm.
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int8_t>(chars_size * 2);

// google-benchmark version: argument sweep handed to generate_string_bench_args
// (row counts 2^12..2^24 with multiplier 8, row lengths 5..40 with multiplier 2).
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 5;
int const max_rowlen = 40;
int const len_mult = 2;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
// nvbench version: "chars" runs generate_character_ngrams; any other "type" value runs
// generate_ngrams with an n-gram size of 2 and the "_" separator.
if (ngram_type == "chars") {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::generate_character_ngrams(input);
});
} else {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::generate_ngrams(input, 2, separator);
});
}
}

// google-benchmark registration helper: for an ngrams_type enumerator `name`, defines a
// TextNGrams fixture benchmark that forwards to BM_ngrams(st, ngrams_type::name) and registers
// it with the generate_bench_args sweep, manual timing, and millisecond units.
// It is instantiated below once for `tokens` and once for `characters`.
#define NVTEXT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(TextNGrams, name) \
(::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \
BENCHMARK_REGISTER_F(TextNGrams, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

NVTEXT_BENCHMARK_DEFINE(tokens)
NVTEXT_BENCHMARK_DEFINE(characters)
// Registers bench_ngrams with nvbench under the name "ngrams".
// Axes:
//   num_rows  - number of rows in the generated strings column; consecutive powers of two
//               from 2^17 to 2^20 (the last entry was previously 1048578, i.e. 2^20 + 2).
//   row_width - upper bound handed to the string-length distribution in bench_ngrams.
//   type      - "chars" runs generate_character_ngrams; any other value (here "tokens")
//               runs generate_ngrams.
NVBENCH_BENCH(bench_ngrams)
.set_name("ngrams")
.add_int64_axis("num_rows", {131072, 262144, 524288, 1048576})
.add_int64_axis("row_width", {10, 20, 40, 100})
.add_string_axis("type", {"chars", "tokens"});

0 comments on commit 0bb699e

Please sign in to comment.