Skip to content

Commit 0c5bd66

Browse files
authored
Rework minhash APIs for deprecation cycle (#17421)
Renames `minhash_permuted()` to `minhash()` and deprecates `minhash_permuted()`. Also removes the `word_minhash` APIs deprecated in 24.12. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: #17421
1 parent 1e95864 commit 0c5bd66

File tree

14 files changed

+100
-1060
lines changed

14 files changed

+100
-1060
lines changed

cpp/benchmarks/text/minhash.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state)
5454
state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output are hashes
5555

5656
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
57-
auto result = base64
58-
? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
59-
: nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
57+
auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width)
58+
: nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width);
6059
});
6160
}
6261

cpp/include/nvtext/minhash.hpp

Lines changed: 17 additions & 177 deletions
Original file line numberDiff line numberDiff line change
@@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext {
3131
* @file
3232
*/
3333

34-
/**
35-
* @brief Returns the minhash value for each string
36-
*
37-
* Hash values are computed from substrings of each string and the
38-
* minimum hash value is returned for each string.
39-
*
40-
* Any null row entries result in corresponding null output rows.
41-
*
42-
* This function uses MurmurHash3_x86_32 for the hash algorithm.
43-
*
44-
* @deprecated Deprecated in 24.12
45-
*
46-
* @throw std::invalid_argument if the width < 2
47-
*
48-
* @param input Strings column to compute minhash
49-
* @param seed Seed value used for the hash algorithm
50-
* @param width The character width used for apply substrings;
51-
* Default is 4 characters.
52-
* @param stream CUDA stream used for device memory operations and kernel launches
53-
* @param mr Device memory resource used to allocate the returned column's device memory
54-
* @return Minhash values for each string in input
55-
*/
56-
[[deprecated]] std::unique_ptr<cudf::column> minhash(
57-
cudf::strings_column_view const& input,
58-
cudf::numeric_scalar<uint32_t> seed = 0,
59-
cudf::size_type width = 4,
60-
rmm::cuda_stream_view stream = cudf::get_default_stream(),
61-
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
62-
63-
/**
64-
* @brief Returns the minhash values for each string per seed
65-
*
66-
* Hash values are computed from substrings of each string and the
67-
* minimum hash value is returned for each string for each seed.
68-
* Each row of the list column are seed results for the corresponding
69-
* string. The order of the elements in each row match the order of
70-
* the seeds provided in the `seeds` parameter.
71-
*
72-
* This function uses MurmurHash3_x86_32 for the hash algorithm.
73-
*
74-
* Any null row entries result in corresponding null output rows.
75-
*
76-
* @deprecated Deprecated in 24.12 - to be replaced in a future release
77-
*
78-
* @throw std::invalid_argument if the width < 2
79-
* @throw std::invalid_argument if seeds is empty
80-
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
81-
*
82-
* @param input Strings column to compute minhash
83-
* @param seeds Seed values used for the hash algorithm
84-
* @param width The character width used for apply substrings;
85-
* Default is 4 characters.
86-
* @param stream CUDA stream used for device memory operations and kernel launches
87-
* @param mr Device memory resource used to allocate the returned column's device memory
88-
* @return List column of minhash values for each string per seed
89-
*/
90-
[[deprecated]] std::unique_ptr<cudf::column> minhash(
91-
cudf::strings_column_view const& input,
92-
cudf::device_span<uint32_t const> seeds,
93-
cudf::size_type width = 4,
94-
rmm::cuda_stream_view stream = cudf::get_default_stream(),
95-
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
96-
9734
/**
9835
* @brief Returns the minhash values for each string
9936
*
@@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext {
13269
* @param mr Device memory resource used to allocate the returned column's device memory
13370
* @return List column of minhash values for each string per seed
13471
*/
135-
std::unique_ptr<cudf::column> minhash_permuted(
72+
std::unique_ptr<cudf::column> minhash(
13673
cudf::strings_column_view const& input,
13774
uint32_t seed,
13875
cudf::device_span<uint32_t const> parameter_a,
@@ -142,67 +79,16 @@ std::unique_ptr<cudf::column> minhash_permuted(
14279
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
14380

14481
/**
145-
* @brief Returns the minhash value for each string
146-
*
147-
* Hash values are computed from substrings of each string and the
148-
* minimum hash value is returned for each string.
149-
*
150-
* Any null row entries result in corresponding null output rows.
151-
*
152-
* This function uses MurmurHash3_x64_128 for the hash algorithm.
153-
* The hash function returns 2 uint64 values but only the first value
154-
* is used with the minhash calculation.
155-
*
156-
* @deprecated Deprecated in 24.12
157-
*
158-
* @throw std::invalid_argument if the width < 2
159-
*
160-
* @param input Strings column to compute minhash
161-
* @param seed Seed value used for the hash algorithm
162-
* @param width The character width used for apply substrings;
163-
* Default is 4 characters.
164-
* @param stream CUDA stream used for device memory operations and kernel launches
165-
* @param mr Device memory resource used to allocate the returned column's device memory
166-
* @return Minhash values as UINT64 for each string in input
167-
*/
168-
[[deprecated]] std::unique_ptr<cudf::column> minhash64(
169-
cudf::strings_column_view const& input,
170-
cudf::numeric_scalar<uint64_t> seed = 0,
171-
cudf::size_type width = 4,
172-
rmm::cuda_stream_view stream = cudf::get_default_stream(),
173-
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
174-
175-
/**
176-
* @brief Returns the minhash values for each string per seed
177-
*
178-
* Hash values are computed from substrings of each string and the
179-
* minimum hash value is returned for each string for each seed.
180-
* Each row of the list column are seed results for the corresponding
181-
* string. The order of the elements in each row match the order of
182-
* the seeds provided in the `seeds` parameter.
183-
*
184-
* This function uses MurmurHash3_x64_128 for the hash algorithm.
82+
* @copydoc nvtext::minhash
18583
*
186-
* Any null row entries result in corresponding null output rows.
187-
*
188-
* @deprecated Deprecated in 24.12 - to be replaced in a future release
189-
*
190-
* @throw std::invalid_argument if the width < 2
191-
* @throw std::invalid_argument if seeds is empty
192-
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
193-
*
194-
* @param input Strings column to compute minhash
195-
* @param seeds Seed values used for the hash algorithm
196-
* @param width The character width used for apply substrings;
197-
* Default is 4 characters.
198-
* @param stream CUDA stream used for device memory operations and kernel launches
199-
* @param mr Device memory resource used to allocate the returned column's device memory
200-
* @return List column of minhash values for each string per seed
84+
* @deprecated Use nvtext::minhash()
20185
*/
202-
[[deprecated]] std::unique_ptr<cudf::column> minhash64(
86+
[[deprecated]] std::unique_ptr<cudf::column> minhash_permuted(
20387
cudf::strings_column_view const& input,
204-
cudf::device_span<uint64_t const> seeds,
205-
cudf::size_type width = 4,
88+
uint32_t seed,
89+
cudf::device_span<uint32_t const> parameter_a,
90+
cudf::device_span<uint32_t const> parameter_b,
91+
cudf::size_type width,
20692
rmm::cuda_stream_view stream = cudf::get_default_stream(),
20793
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
20894

@@ -244,7 +130,7 @@ std::unique_ptr<cudf::column> minhash_permuted(
244130
* @param mr Device memory resource used to allocate the returned column's device memory
245131
* @return List column of minhash values for each string per seed
246132
*/
247-
std::unique_ptr<cudf::column> minhash64_permuted(
133+
std::unique_ptr<cudf::column> minhash64(
248134
cudf::strings_column_view const& input,
249135
uint64_t seed,
250136
cudf::device_span<uint64_t const> parameter_a,
@@ -254,64 +140,18 @@ std::unique_ptr<cudf::column> minhash64_permuted(
254140
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
255141

256142
/**
257-
* @brief Returns the minhash values for each row of strings per seed
258-
*
259-
* Hash values are computed from each string in each row and the
260-
* minimum hash value is returned for each row for each seed.
261-
* Each row of the output list column are seed results for the corresponding
262-
* input row. The order of the elements in each row match the order of
263-
* the seeds provided in the `seeds` parameter.
264-
*
265-
* This function uses MurmurHash3_x86_32 for the hash algorithm.
266-
*
267-
* Any null row entries result in corresponding null output rows.
143+
* @copydoc nvtext::minhash64
268144
*
269-
* @deprecated Deprecated in 24.12
270-
*
271-
* @throw std::invalid_argument if seeds is empty
272-
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
273-
*
274-
* @param input Lists column of strings to compute minhash
275-
* @param seeds Seed values used for the hash algorithm
276-
* @param stream CUDA stream used for device memory operations and kernel launches
277-
* @param mr Device memory resource used to allocate the returned column's device memory
278-
* @return List column of minhash values for each string per seed
145+
* @deprecated Use nvtext::minhash64()
279146
*/
280-
[[deprecated]] std::unique_ptr<cudf::column> word_minhash(
281-
cudf::lists_column_view const& input,
282-
cudf::device_span<uint32_t const> seeds,
147+
[[deprecated]] std::unique_ptr<cudf::column> minhash64_permuted(
148+
cudf::strings_column_view const& input,
149+
uint64_t seed,
150+
cudf::device_span<uint64_t const> parameter_a,
151+
cudf::device_span<uint64_t const> parameter_b,
152+
cudf::size_type width,
283153
rmm::cuda_stream_view stream = cudf::get_default_stream(),
284154
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
285155

286-
/**
287-
* @brief Returns the minhash values for each row of strings per seed
288-
*
289-
* Hash values are computed from each string in each row and the
290-
* minimum hash value is returned for each row for each seed.
291-
* Each row of the output list column are seed results for the corresponding
292-
* input row. The order of the elements in each row match the order of
293-
* the seeds provided in the `seeds` parameter.
294-
*
295-
* This function uses MurmurHash3_x64_128 for the hash algorithm though
296-
* only the first 64-bits of the hash are used in computing the output.
297-
*
298-
* Any null row entries result in corresponding null output rows.
299-
*
300-
* @deprecated Deprecated in 24.12
301-
*
302-
* @throw std::invalid_argument if seeds is empty
303-
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
304-
*
305-
* @param input Lists column of strings to compute minhash
306-
* @param seeds Seed values used for the hash algorithm
307-
* @param stream CUDA stream used for device memory operations and kernel launches
308-
* @param mr Device memory resource used to allocate the returned column's device memory
309-
* @return List column of minhash values for each string per seed
310-
*/
311-
[[deprecated]] std::unique_ptr<cudf::column> word_minhash64(
312-
cudf::lists_column_view const& input,
313-
cudf::device_span<uint64_t const> seeds,
314-
rmm::cuda_stream_view stream = cudf::get_default_stream(),
315-
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
316156
/** @} */ // end of group
317157
} // namespace CUDF_EXPORT nvtext

0 commit comments

Comments
 (0)