@@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext {
31
31
* @file
32
32
*/
33
33
34
- /**
35
- * @brief Returns the minhash value for each string
36
- *
37
- * Hash values are computed from substrings of each string and the
38
- * minimum hash value is returned for each string.
39
- *
40
- * Any null row entries result in corresponding null output rows.
41
- *
42
- * This function uses MurmurHash3_x86_32 for the hash algorithm.
43
- *
44
- * @deprecated Deprecated in 24.12
45
- *
46
- * @throw std::invalid_argument if the width < 2
47
- *
48
- * @param input Strings column to compute minhash
49
- * @param seed Seed value used for the hash algorithm
50
- * @param width The character width used for apply substrings;
51
- * Default is 4 characters.
52
- * @param stream CUDA stream used for device memory operations and kernel launches
53
- * @param mr Device memory resource used to allocate the returned column's device memory
54
- * @return Minhash values for each string in input
55
- */
56
- [[deprecated]] std::unique_ptr<cudf::column> minhash (
57
- cudf::strings_column_view const & input,
58
- cudf::numeric_scalar<uint32_t > seed = 0 ,
59
- cudf::size_type width = 4 ,
60
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
61
- rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
62
-
63
- /**
64
- * @brief Returns the minhash values for each string per seed
65
- *
66
- * Hash values are computed from substrings of each string and the
67
- * minimum hash value is returned for each string for each seed.
68
- * Each row of the list column are seed results for the corresponding
69
- * string. The order of the elements in each row match the order of
70
- * the seeds provided in the `seeds` parameter.
71
- *
72
- * This function uses MurmurHash3_x86_32 for the hash algorithm.
73
- *
74
- * Any null row entries result in corresponding null output rows.
75
- *
76
- * @deprecated Deprecated in 24.12 - to be replaced in a future release
77
- *
78
- * @throw std::invalid_argument if the width < 2
79
- * @throw std::invalid_argument if seeds is empty
80
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
81
- *
82
- * @param input Strings column to compute minhash
83
- * @param seeds Seed values used for the hash algorithm
84
- * @param width The character width used for apply substrings;
85
- * Default is 4 characters.
86
- * @param stream CUDA stream used for device memory operations and kernel launches
87
- * @param mr Device memory resource used to allocate the returned column's device memory
88
- * @return List column of minhash values for each string per seed
89
- */
90
- [[deprecated]] std::unique_ptr<cudf::column> minhash (
91
- cudf::strings_column_view const & input,
92
- cudf::device_span<uint32_t const > seeds,
93
- cudf::size_type width = 4 ,
94
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
95
- rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
96
-
97
34
/**
98
35
* @brief Returns the minhash values for each string
99
36
*
@@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext {
132
69
* @param mr Device memory resource used to allocate the returned column's device memory
133
70
* @return List column of minhash values for each string per seed
134
71
*/
135
- std::unique_ptr<cudf::column> minhash_permuted (
72
+ std::unique_ptr<cudf::column> minhash (
136
73
cudf::strings_column_view const & input,
137
74
uint32_t seed,
138
75
cudf::device_span<uint32_t const > parameter_a,
@@ -142,67 +79,16 @@ std::unique_ptr<cudf::column> minhash_permuted(
142
79
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
143
80
144
81
/**
145
- * @brief Returns the minhash value for each string
146
- *
147
- * Hash values are computed from substrings of each string and the
148
- * minimum hash value is returned for each string.
149
- *
150
- * Any null row entries result in corresponding null output rows.
151
- *
152
- * This function uses MurmurHash3_x64_128 for the hash algorithm.
153
- * The hash function returns 2 uint64 values but only the first value
154
- * is used with the minhash calculation.
155
- *
156
- * @deprecated Deprecated in 24.12
157
- *
158
- * @throw std::invalid_argument if the width < 2
159
- *
160
- * @param input Strings column to compute minhash
161
- * @param seed Seed value used for the hash algorithm
162
- * @param width The character width used for apply substrings;
163
- * Default is 4 characters.
164
- * @param stream CUDA stream used for device memory operations and kernel launches
165
- * @param mr Device memory resource used to allocate the returned column's device memory
166
- * @return Minhash values as UINT64 for each string in input
167
- */
168
- [[deprecated]] std::unique_ptr<cudf::column> minhash64 (
169
- cudf::strings_column_view const & input,
170
- cudf::numeric_scalar<uint64_t > seed = 0 ,
171
- cudf::size_type width = 4 ,
172
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
173
- rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
174
-
175
- /**
176
- * @brief Returns the minhash values for each string per seed
177
- *
178
- * Hash values are computed from substrings of each string and the
179
- * minimum hash value is returned for each string for each seed.
180
- * Each row of the list column are seed results for the corresponding
181
- * string. The order of the elements in each row match the order of
182
- * the seeds provided in the `seeds` parameter.
183
- *
184
- * This function uses MurmurHash3_x64_128 for the hash algorithm.
82
+ * @copydoc nvtext::minhash
185
83
*
186
- * Any null row entries result in corresponding null output rows.
187
- *
188
- * @deprecated Deprecated in 24.12 - to be replaced in a future release
189
- *
190
- * @throw std::invalid_argument if the width < 2
191
- * @throw std::invalid_argument if seeds is empty
192
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
193
- *
194
- * @param input Strings column to compute minhash
195
- * @param seeds Seed values used for the hash algorithm
196
- * @param width The character width used for apply substrings;
197
- * Default is 4 characters.
198
- * @param stream CUDA stream used for device memory operations and kernel launches
199
- * @param mr Device memory resource used to allocate the returned column's device memory
200
- * @return List column of minhash values for each string per seed
84
+ * @deprecated Use nvtext::minhash()
201
85
*/
202
- [[deprecated]] std::unique_ptr<cudf::column> minhash64 (
86
+ [[deprecated]] std::unique_ptr<cudf::column> minhash_permuted (
203
87
cudf::strings_column_view const & input,
204
- cudf::device_span<uint64_t const > seeds,
205
- cudf::size_type width = 4 ,
88
+ uint32_t seed,
89
+ cudf::device_span<uint32_t const > parameter_a,
90
+ cudf::device_span<uint32_t const > parameter_b,
91
+ cudf::size_type width,
206
92
rmm::cuda_stream_view stream = cudf::get_default_stream(),
207
93
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
208
94
@@ -244,7 +130,7 @@ std::unique_ptr<cudf::column> minhash_permuted(
244
130
* @param mr Device memory resource used to allocate the returned column's device memory
245
131
* @return List column of minhash values for each string per seed
246
132
*/
247
- std::unique_ptr<cudf::column> minhash64_permuted (
133
+ std::unique_ptr<cudf::column> minhash64 (
248
134
cudf::strings_column_view const & input,
249
135
uint64_t seed,
250
136
cudf::device_span<uint64_t const > parameter_a,
@@ -254,64 +140,18 @@ std::unique_ptr<cudf::column> minhash64_permuted(
254
140
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
255
141
256
142
/**
257
- * @brief Returns the minhash values for each row of strings per seed
258
- *
259
- * Hash values are computed from each string in each row and the
260
- * minimum hash value is returned for each row for each seed.
261
- * Each row of the output list column are seed results for the corresponding
262
- * input row. The order of the elements in each row match the order of
263
- * the seeds provided in the `seeds` parameter.
264
- *
265
- * This function uses MurmurHash3_x86_32 for the hash algorithm.
266
- *
267
- * Any null row entries result in corresponding null output rows.
143
+ * @copydoc nvtext::minhash64
268
144
*
269
- * @deprecated Deprecated in 24.12
270
- *
271
- * @throw std::invalid_argument if seeds is empty
272
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
273
- *
274
- * @param input Lists column of strings to compute minhash
275
- * @param seeds Seed values used for the hash algorithm
276
- * @param stream CUDA stream used for device memory operations and kernel launches
277
- * @param mr Device memory resource used to allocate the returned column's device memory
278
- * @return List column of minhash values for each string per seed
145
+ * @deprecated Use nvtext::minhash64()
279
146
*/
280
- [[deprecated]] std::unique_ptr<cudf::column> word_minhash (
281
- cudf::lists_column_view const & input,
282
- cudf::device_span<uint32_t const > seeds,
147
+ [[deprecated]] std::unique_ptr<cudf::column> minhash64_permuted (
148
+ cudf::strings_column_view const & input,
149
+ uint64_t seed,
150
+ cudf::device_span<uint64_t const > parameter_a,
151
+ cudf::device_span<uint64_t const > parameter_b,
152
+ cudf::size_type width,
283
153
rmm::cuda_stream_view stream = cudf::get_default_stream(),
284
154
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
285
155
286
- /**
287
- * @brief Returns the minhash values for each row of strings per seed
288
- *
289
- * Hash values are computed from each string in each row and the
290
- * minimum hash value is returned for each row for each seed.
291
- * Each row of the output list column are seed results for the corresponding
292
- * input row. The order of the elements in each row match the order of
293
- * the seeds provided in the `seeds` parameter.
294
- *
295
- * This function uses MurmurHash3_x64_128 for the hash algorithm though
296
- * only the first 64-bits of the hash are used in computing the output.
297
- *
298
- * Any null row entries result in corresponding null output rows.
299
- *
300
- * @deprecated Deprecated in 24.12
301
- *
302
- * @throw std::invalid_argument if seeds is empty
303
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
304
- *
305
- * @param input Lists column of strings to compute minhash
306
- * @param seeds Seed values used for the hash algorithm
307
- * @param stream CUDA stream used for device memory operations and kernel launches
308
- * @param mr Device memory resource used to allocate the returned column's device memory
309
- * @return List column of minhash values for each string per seed
310
- */
311
- [[deprecated]] std::unique_ptr<cudf::column> word_minhash64 (
312
- cudf::lists_column_view const & input,
313
- cudf::device_span<uint64_t const > seeds,
314
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
315
- rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
316
156
/** @} */ // end of group
317
157
} // namespace CUDF_EXPORT nvtext
0 commit comments