Skip to content

OSS MPZCH CUDA kernel in FBGEMM #4214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions fbgemm_gpu/FbgemmGpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ if(NOT FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CPU)
src/metric_ops/metric_ops_host.cpp
src/input_combine_ops/input_combine_gpu.cpp)

if(NOT FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_ROCM)
list(APPEND fbgemm_gpu_sources_cpu_static
src/faster_hash_ops/faster_hash.cpp)
endif()

if(NVML_LIB_PATH OR FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_ROCM)
message(STATUS "Adding merge_pooled_embeddings sources")
list(APPEND fbgemm_gpu_sources_cpu_static
Expand Down Expand Up @@ -122,6 +127,11 @@ if(NOT FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CPU)
src/sparse_ops/sparse_reorder_batched_ad.cu
src/sparse_ops/sparse_segment_sum_csr.cu
src/sparse_ops/sparse_zipf.cu)

if(NOT FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_ROCM)
list(APPEND fbgemm_gpu_sources_gpu_static
src/faster_hash_ops/faster_hash.cu)
endif()
endif()


Expand Down
137 changes: 137 additions & 0 deletions fbgemm_gpu/include/fbgemm_gpu/faster_hash_ops/common_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* The MIT License (MIT)
*
* Copyright (C) 2016 ExplosionAI GmbH, 2014-2015 Matthew Honnibal, 2016 spaCy
* GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <ATen/ATen.h>

#define AT_DISPATCH_INTEGER_TYPES(TYPE, NAME, HINT, ...) \
AT_DISPATCH_SWITCH( \
TYPE, \
NAME, \
AT_PRIVATE_CASE_TYPE_USING_HINT(at::ScalarType::Int, HINT, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Long, HINT, __VA_ARGS__))

namespace fbgemm_gpu {

#if defined(TORBOREC_CUDA)
#define TORBOREC_INLINE __device__ __host__ __inline__
#else
#define TORBOREC_INLINE inline
#endif

// NOLINTNEXTLINE:
TORBOREC_INLINE uint64_t
murmur_hash3_2x64(const uint64_t x, const uint64_t y, const uint64_t seed) {
const uint64_t c1 = 0x87c37b91114253d5;
const uint64_t c2 = 0x4cf5ad432745937f;

uint64_t h1 = seed;
uint64_t h2 = seed;

// First 64-bit block
uint64_t k1 = x;
k1 *= c1;
k1 = (k1 << 31) | (k1 >> (64 - 31));
k1 *= c2;
h1 ^= k1;
h1 = (h1 << 27) | (h1 >> (64 - 27));
h1 += h2;
h1 = h1 * 5 + 0x52dce729;

// Second 64-bit block
uint64_t k2 = y;
k2 *= c2;
k2 = (k2 << 33) | (k2 >> (64 - 33));
k2 *= c1;
h2 ^= k2;
h2 = (h2 << 31) | (h2 >> (64 - 31));
h2 += h1;
h2 = h2 * 5 + 0x38495ab5;

// Finalization
h1 ^= 16;
h2 ^= 16;
h1 += h2;
h2 += h1;
h1 ^= h1 >> 33;
h1 *= 0xff51afd7ed558ccd;
h1 ^= h1 >> 33;
h1 *= 0xc4ceb9fe1a85ec53;
h1 ^= h1 >> 33;
h2 ^= h2 >> 33;
h2 *= 0xff51afd7ed558ccd;
h2 ^= h2 >> 33;
h2 *= 0xc4ceb9fe1a85ec53;
h2 ^= h2 >> 33;
h1 += h2;
h2 += h1;

return h1 ^ h2;
}

// NOLINTNEXTLINE:
template <bool CIRCULAR_PROBE>
TORBOREC_INLINE int64_t next_output_index(
int64_t output_index,
int64_t modulo,
int64_t& /* max_probe_local */) {
static_assert(CIRCULAR_PROBE);
return (output_index + 1) % modulo;
}

// NOLINTNEXTLINE:
template <>
TORBOREC_INLINE int64_t next_output_index<false>(
int64_t output_index,
int64_t modulo,
int64_t& max_probe_local) {
output_index = (output_index + 1) % modulo;
if (output_index == 0) {
// circular, using max_probe_local to control exit.
max_probe_local = 0;
}
return output_index;
}

TORBOREC_INLINE bool is_eviction_enabled(
bool readonly,
int eviction_threshold,
int eviction_policy) {
return !readonly && (eviction_threshold > 0 || eviction_policy > 0);
}

#undef TORBOREC_INLINE

} // namespace fbgemm_gpu
80 changes: 80 additions & 0 deletions fbgemm_gpu/include/fbgemm_gpu/faster_hash_ops/faster_hash_ops.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <ATen/ATen.h>

/// @defgroup faster-hash-ops CUDA Operators
/// The following are CUDA Operators

namespace fbgemm_gpu {

using at::Tensor;

///@ingroup faster-hash-ops
///
/// CUDA implementation of zero collision hash
///
/// @param output the output tensor that will be modified in place
/// @param evict_slots the slots that will be evicted
/// @param input the input tensor
/// @param identities the identity tensor
/// @param max_probe the maximum number of probes
/// @param circular_probe whether to use circular probe
/// @param cur_hour the current hour
/// @param readonly whether to use readonly mode
/// @param support_evict whether to support evict
/// @param local_sizes the local sizes tensor
/// @param offsets the offsets tensor
/// @param hash_identity whether to hash the identity
/// @param metadata the metadata tensor
/// @param disable_fallback whether to disable fallback
/// @param input_metadata the input metadata tensor
/// @param eviction_threshold the eviction threshold
/// @param eviction_policy the eviction policy
/// @param opt_in_prob the opt-in probability
/// @param num_reserved_slots the number of reserved slots
/// @param opt_in_rands the opt-in randoms tensor
///
/// @return None
template <typename TInput, typename TIdentity>
void _zero_collision_hash_cuda(
Tensor& output,
Tensor& evict_slots,
const Tensor& input,
Tensor& identities,
int64_t max_probe,
bool circular_probe,
int64_t cur_hour,
bool readonly,
bool support_evict,
const std::optional<Tensor>& local_sizes,
const std::optional<Tensor>& offsets,
int32_t hash_identity,
const std::optional<Tensor>& metadata,
bool disable_fallback,
const std::optional<Tensor>& input_metadata,
int64_t eviction_threshold,
int64_t eviction_policy,
int64_t opt_in_prob,
int64_t num_reserved_slots,
const std::optional<Tensor>& opt_in_rands);

///@ingroup faster-hash-ops
///
/// CUDA implementation of murmurhash3
///
/// @param input the input tensor
/// @param y the y value
/// @param seed the seed value

/// @return the output tensor
Tensor murmur_hash3_cuda(const Tensor& input, int64_t y, int64_t seed);

} // namespace fbgemm_gpu
Loading
Loading