Skip to content

Commit a273543

Browse files
lizhouyufacebook-github-bot
authored andcommitted
OSS MPZCH CUDA kernel in FBGEMM (#4214)
Summary: Pull Request resolved: #4214 X-link: facebookresearch/FBGEMM#1290 Opensource FBGEMM CUDA Kernel for MPZCH feature ### Major changes - Create a folder named `faster_hash` under the `fbgemm/fbgemmgpu/src` folder. - Copy the following files to the created folder from `fbsource/fbcode/caffe2/torch/fb/retrieval` - faster_hash.cpp - faster_hash.cu - common_utils.cuh - Revise the `faster_hash.cpp` - Change `namespace fb` to `namespace fbgemm_gpu`. - Comment out `using namespace torch::fb::turborec;` - Change `TORCH_LIBRARY_IMPL(fb, ...)` to `TORCH_LIBRARY_IMPL(fbgemm, ...)` - Fix namespace calling issue due to the namespace change. - Revise the `faster_hash.cu` - Change `namespace fb` to `namespace fbgemm_gpu`. - Change `TORCH_LIBRARY_IMPL(fb, ...)` to `TORCH_LIBRARY_IMPL(fbgemm, ...)` - Fix namespace calling issue due to the namespace change. - Revise the `common_utils.cuh` file - Change `namespace fb` to `namespace fbgemm_gpu`. - Add a BUCK file to compile the cpp and cuda library. - Copy the `faster_hash_test.py` file to the `fbgemm/fbgemm_gpu/test` folder. - Add a section in the BUCK file under the `test` folder for `python_unittest` of `faster_hash_test`. - In the `faster_hash_test.py` file - Load the `faster_hash` related libraries with `torch.ops.load` API. - Replace all the `torch.ops.fb` to `torch.ops.fbgemm`. - Following other test files to add `opensource` and `gpu availability` check. ### Questions - After refactorying, the API calls `torch.ops.create_zch_buffer`, `torch.ops.zero_collision_hash`, `torch.ops.fbgemm.zero_collision_hash`, and `torch.ops.fbgemm.create_zch_buffer` are all valid, while `torch.ops.create_zch_buffer` and `torch.ops.zero_collision_hash` may incur certain parameter mismatches. How to resolve this issue and disable the API calls without `fbgemm`? - How to integrate the refactoryed library into fbgemm so the test can call something like `from fbgemm_gpu import create_zch_buffer, zero_collision_hash`? Differential Revision: D75505020
1 parent 16c61f1 commit a273543

File tree

4 files changed

+3260
-0
lines changed

4 files changed

+3260
-0
lines changed
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/*
2+
* The MIT License (MIT)
3+
*
4+
* Copyright (C) 2016 ExplosionAI GmbH, 2014-2015 Matthew Honnibal, 2016 spaCy
5+
* GmbH
6+
*
7+
* Permission is hereby granted, free of charge, to any person obtaining a copy
8+
* of this software and associated documentation files (the "Software"), to deal
9+
* in the Software without restriction, including without limitation the rights
10+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
* copies of the Software, and to permit persons to whom the Software is
12+
* furnished to do so, subject to the following conditions:
13+
*
14+
* The above copyright notice and this permission notice shall be included in
15+
* all copies or substantial portions of the Software.
16+
*
17+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23+
* THE SOFTWARE.
24+
*
25+
*/
26+
/*
27+
* Copyright (c) Meta Platforms, Inc. and affiliates.
28+
* All rights reserved.
29+
*
30+
* This source code is licensed under the BSD-style license found in the
31+
* LICENSE file in the root directory of this source tree.
32+
*/
33+
34+
#pragma once
35+
36+
#include <ATen/ATen.h>
37+
38+
#define AT_DISPATCH_INTEGER_TYPES(TYPE, NAME, HINT, ...) \
39+
AT_DISPATCH_SWITCH( \
40+
TYPE, \
41+
NAME, \
42+
AT_PRIVATE_CASE_TYPE_USING_HINT(at::ScalarType::Int, HINT, __VA_ARGS__) \
43+
AT_PRIVATE_CASE_TYPE_USING_HINT( \
44+
at::ScalarType::Long, HINT, __VA_ARGS__))
45+
46+
namespace fbgemm_gpu {
47+
48+
#if defined(TORBOREC_CUDA)
49+
#define TORBOREC_INLINE __device__ __host__ __inline__
50+
#else
51+
#define TORBOREC_INLINE inline
52+
#endif
53+
54+
// NOLINTNEXTLINE:
55+
TORBOREC_INLINE uint64_t
56+
murmur_hash3_2x64(const uint64_t x, const uint64_t y, const uint64_t seed) {
57+
const uint64_t c1 = 0x87c37b91114253d5;
58+
const uint64_t c2 = 0x4cf5ad432745937f;
59+
60+
uint64_t h1 = seed;
61+
uint64_t h2 = seed;
62+
63+
// First 64-bit block
64+
uint64_t k1 = x;
65+
k1 *= c1;
66+
k1 = (k1 << 31) | (k1 >> (64 - 31));
67+
k1 *= c2;
68+
h1 ^= k1;
69+
h1 = (h1 << 27) | (h1 >> (64 - 27));
70+
h1 += h2;
71+
h1 = h1 * 5 + 0x52dce729;
72+
73+
// Second 64-bit block
74+
uint64_t k2 = y;
75+
k2 *= c2;
76+
k2 = (k2 << 33) | (k2 >> (64 - 33));
77+
k2 *= c1;
78+
h2 ^= k2;
79+
h2 = (h2 << 31) | (h2 >> (64 - 31));
80+
h2 += h1;
81+
h2 = h2 * 5 + 0x38495ab5;
82+
83+
// Finalization
84+
h1 ^= 16;
85+
h2 ^= 16;
86+
h1 += h2;
87+
h2 += h1;
88+
h1 ^= h1 >> 33;
89+
h1 *= 0xff51afd7ed558ccd;
90+
h1 ^= h1 >> 33;
91+
h1 *= 0xc4ceb9fe1a85ec53;
92+
h1 ^= h1 >> 33;
93+
h2 ^= h2 >> 33;
94+
h2 *= 0xff51afd7ed558ccd;
95+
h2 ^= h2 >> 33;
96+
h2 *= 0xc4ceb9fe1a85ec53;
97+
h2 ^= h2 >> 33;
98+
h1 += h2;
99+
h2 += h1;
100+
101+
return h1 ^ h2;
102+
}
103+
104+
// NOLINTNEXTLINE:
105+
template <bool CIRCULAR_PROBE>
106+
TORBOREC_INLINE int64_t next_output_index(
107+
int64_t output_index,
108+
int64_t modulo,
109+
int64_t& /* max_probe_local */) {
110+
static_assert(CIRCULAR_PROBE);
111+
return (output_index + 1) % modulo;
112+
}
113+
114+
// NOLINTNEXTLINE:
115+
template <>
116+
TORBOREC_INLINE int64_t next_output_index<false>(
117+
int64_t output_index,
118+
int64_t modulo,
119+
int64_t& max_probe_local) {
120+
output_index = (output_index + 1) % modulo;
121+
if (output_index == 0) {
122+
// circular, using max_probe_local to control exit.
123+
max_probe_local = 0;
124+
}
125+
return output_index;
126+
}
127+
128+
TORBOREC_INLINE bool is_eviction_enabled(
129+
bool readonly,
130+
int eviction_threshold,
131+
int eviction_policy) {
132+
return !readonly && (eviction_threshold > 0 || eviction_policy > 0);
133+
}
134+
135+
#undef TORBOREC_INLINE
136+
137+
} // namespace fbgemm_gpu

0 commit comments

Comments
 (0)