
Commit 8cb23a5

build hip_fmha on github (#104)
Summary: Pull Request resolved: #104
Differential Revision: D90669881
1 parent 7bc1973 commit 8cb23a5

File tree

1,072 files changed (+8076, -212 lines)


CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -126,6 +126,7 @@ include(${CMAKEMODULES}/GpuCppLibrary.cmake)
 set(mslk_include_directories
     # MSLK
     ${MSLK}/include
+    ${MSLK}/csrc/attention/ck/fmha/hip_fmha
     # PyTorch
     ${TORCH_INCLUDE_DIRS}
     # Third-party

MslkDefault.cmake

Lines changed: 4 additions & 2 deletions

@@ -16,7 +16,8 @@ glob_files_nohip(mslk_cpp_source_files_cpu
     csrc/gemm/*.cpp
     csrc/kv_cache/*.cpp
     csrc/moe/*.cpp
-    csrc/quantize/*.cpp)
+    csrc/quantize/*.cpp
+    csrc/attention/ck/fmha/*.cpp)

 glob_files_nohip(mslk_cpp_source_files_gpu
     csrc/attention/cuda/gqa_attn_splitk/*.cu
@@ -79,7 +80,8 @@ file(GLOB_RECURSE mslk_cpp_source_files_hip
     csrc/gemm/ck/*.hip
     csrc/gemm/ck/**/*.hip
     csrc/quantize/ck/*.hip
-    csrc/quantize/ck/**/*.hip)
+    csrc/quantize/ck/**/*.hip
+    csrc/attention/ck/fmha/hip_fmha/**/*.hip)

 ################################################################################
 # Build Shared Library

csrc/attention/ck/fmha/hip_fmha/GENERATE_INSTANCES.md

Lines changed: 1 addition & 0 deletions

@@ -1,3 +1,4 @@
+
 # Instances generator

 The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files).
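
The note above is this commit's only description of the generator, so here is a minimal sketch of that kind of tool for orientation. Everything in it (the dtype/mask/head-dim axes, the kernel template, and the output file names) is a hypothetical illustration, not code from the actual generator.

# Hypothetical sketch: enumerate (dtype, mask, head-dim) combinations, write one
# .cpp instantiation per combination, and emit a single reference header that
# declares them all. All names here are illustrative, not the real generator's.
from itertools import product
from pathlib import Path

DTYPES = ["fp16", "bf16"]
MASKS = ["no_mask", "causal"]
HEAD_DIMS = [64, 128, 256]

CPP_TEMPLATE = """#include "fmha_kernel_template.h"  // hypothetical include

template void run_fmha_kernel<{dtype}_t, {mask}_t, {hdim}>(FmhaParams&, hipStream_t);
"""

def generate(out_dir: str = "generated_instances") -> None:
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    decls = ["#pragma once"]
    for dtype, mask, hdim in product(DTYPES, MASKS, HEAD_DIMS):
        name = f"fmha_{dtype}_{mask}_hd{hdim}"
        # One translation unit per instance keeps per-file compile times small.
        (out / f"{name}.cpp").write_text(
            CPP_TEMPLATE.format(dtype=dtype, mask=mask, hdim=hdim))
        decls.append(
            f"extern template void run_fmha_kernel<{dtype}_t, {mask}_t, {hdim}>"
            "(FmhaParams&, hipStream_t);")
    # The reference header lets other translation units see every generated instance.
    (out / "fmha_instances_ref.h").write_text("\n".join(decls) + "\n")

if __name__ == "__main__":
    generate()

In the real build, glob rules like the ones added to MslkDefault.cmake above are presumably what pick such generated files up.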

csrc/attention/ck/fmha/hip_fmha/attention_backward_generic_ck_tiled.hip renamed to csrc/attention/ck/fmha/hip_fmha/attention_backward_generic_ck_tiled.cpp

Lines changed: 19 additions & 18 deletions

@@ -11,8 +11,9 @@
 #include <ATen/ScalarOps.h>
 #include <ATen/Tensor.h>
 #include <ATen/TensorOperators.h>
-#include <c10/cuda/CUDAGuard.h>
+#include <c10/hip/HIPStream.h>
 #include <torch/library.h>
+#include <ATen/cuda/PhiloxUtils.cuh>

 #include "ck_fmha_util.h"
 #include "ck_tiled_fmha_params.h"
@@ -38,26 +39,26 @@ efficient_attention_backward_ck(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    const std::optional<at::Tensor>& bias, // additive attention bias
+    const c10::optional<at::Tensor>& bias, // additive attention bias
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
-    const std::optional<at::Tensor>& seqstart_q,
+    const c10::optional<at::Tensor>& seqstart_q,
     // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
     // position of the first key token for batch $b
-    const std::optional<at::Tensor>& seqstart_k,
+    const c10::optional<at::Tensor>& seqstart_k,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const std::optional<int64_t> max_seqlen_q_,
+    const c10::optional<int64_t> max_seqlen_q_,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const std::optional<int64_t> max_seqlen_k_,
-    const std::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> max_seqlen_k_,
+    const c10::optional<at::Tensor>& seqlen_k,
     const at::Tensor& logsumexp,
     const at::Tensor& out,
     double dropout_p, // dropout probability
     int64_t rng_seed, // seed using for generating random numbers for dropout
     int64_t rng_offset, // offset into random number sequence
     int64_t custom_mask_type,
-    const std::optional<double> scale,
-    const std::optional<int64_t> window_size) {
+    const c10::optional<double> scale,
+    const c10::optional<int64_t> window_size) {
   // ndim
   TORCH_CHECK(query.dim() == grad_out.dim());
   TORCH_CHECK(query.dim() == key.dim());
@@ -111,7 +112,7 @@ efficient_attention_backward_ck(
     TORCH_CHECK(max_seqlen_k_.has_value());
   }

-  hipStream_t stream = at::hip::getCurrentHIPStream().stream();
+  hipStream_t stream = c10::hip::getCurrentHIPStream().stream();

   int64_t B = query.size(0);
   int64_t M = query.size(1);
@@ -550,26 +551,26 @@ efficient_attention_backward_ck_meta(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    const std::optional<at::Tensor>& bias, // additive attention bias
+    const c10::optional<at::Tensor>& bias, // additive attention bias
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
-    const std::optional<at::Tensor>& seqstart_q,
+    const c10::optional<at::Tensor>& seqstart_q,
    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
     // position of the first key token for batch $b
-    const std::optional<at::Tensor>& seqstart_k,
+    const c10::optional<at::Tensor>& seqstart_k,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const std::optional<int64_t> max_seqlen_q_,
+    const c10::optional<int64_t> max_seqlen_q_,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const std::optional<int64_t> max_seqlen_k_,
-    const std::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> max_seqlen_k_,
+    const c10::optional<at::Tensor>& seqlen_k,
     const at::Tensor& logsumexp,
     const at::Tensor& out,
     double dropout_p, // dropout probability
     int64_t rng_seed, // seed using for generating random numbers for dropout
     int64_t rng_offset, // offset into random number sequence
     int64_t custom_mask_type,
-    const std::optional<double> scale,
-    const std::optional<int64_t> window_size) {
+    const c10::optional<double> scale,
+    const c10::optional<int64_t> window_size) {
   int64_t B = query.size(0);
   int64_t M = query.size(1);
   int64_t N = key.size(1);

csrc/attention/ck/fmha/hip_fmha/attention_ck_rand_uniform.hip renamed to csrc/attention/ck/fmha/hip_fmha/attention_ck_rand_uniform.cpp

Lines changed: 4 additions & 4 deletions

@@ -6,12 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <c10/core/TensorOptions.h>
+#include <c10/hip/HIPStream.h>
 #include <torch/library.h>
 #include <torch/types.h>
-#include <ATen/cuda/CUDAGraphsUtils.cuh>
+#include <ATen/cuda/PhiloxUtils.cuh>

 #include <ck_tile/core.hpp>
 #include <ck_tile/host/kernel_launch.hpp>
@@ -33,11 +33,11 @@ at::Tensor rand_uniform_int(
   int M = out_pattern.size(2);
   int N = out_pattern.size(3);

-  hipStream_t stream = at::hip::getCurrentHIPStream().stream();
+  hipStream_t stream = c10::hip::getCurrentHIPStream().stream();

   at::CUDAGeneratorImpl* gen =
       at::get_generator_or_default<at::CUDAGeneratorImpl>(
-          std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());

   at::PhiloxCudaState rng_engine_inputs;
   {

csrc/attention/ck/fmha/hip_fmha/attention_forward_generic_ck_tiled.hip renamed to csrc/attention/ck/fmha/hip_fmha/attention_forward_generic_ck_tiled.cpp

Lines changed: 25 additions & 23 deletions

@@ -12,12 +12,11 @@
 #include <ATen/ScalarOps.h>
 #include <ATen/Tensor.h>
 #include <ATen/core/Generator.h>
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
-#include <c10/cuda/CUDAGuard.h>
+#include <c10/hip/HIPStream.h>
 #include <c10/util/Optional.h>
 #include <torch/library.h>
-#include <ATen/cuda/CUDAGraphsUtils.cuh>
+#include <ATen/cuda/PhiloxUtils.cuh>

 #include "ck_fmha_util.h"
 #include "ck_tiled_fmha_fwd_splitkv_selector.h"
@@ -53,23 +52,23 @@ efficient_attention_forward_ck(
     const at::Tensor& query, // [b, seqlen, num_heads_q, K]
     const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
     const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
-    const std::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
+    const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
-    const std::optional<at::Tensor>& seqstart_q,
+    const c10::optional<at::Tensor>& seqstart_q,
     // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
     // position of the first key token for batch $b
-    const std::optional<at::Tensor>& seqstart_k,
+    const c10::optional<at::Tensor>& seqstart_k,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const std::optional<int64_t> max_seqlen_q_,
+    const c10::optional<int64_t> max_seqlen_q_,
     double dropout_p, // attention matrix dropout probability
     bool compute_logsumexp,
     int64_t custom_mask_type,
-    std::optional<double> scale,
-    const std::optional<at::Tensor>& seqlen_k,
-    const std::optional<int64_t> window_size,
-    const std::optional<at::Tensor>& block_tables,
-    const std::optional<int64_t> page_size) {
+    c10::optional<double> scale,
+    const c10::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> window_size,
+    const c10::optional<at::Tensor>& block_tables,
+    const c10::optional<int64_t> page_size) {
   TORCH_CHECK(query.dim() == 4);
   TORCH_CHECK(key.dim() == 4);
   TORCH_CHECK(value.dim() == 4);
@@ -116,7 +115,7 @@ efficient_attention_forward_ck(
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);

-  hipStream_t stream = at::hip::getCurrentHIPStream().stream();
+  hipStream_t stream = c10::hip::getCurrentHIPStream().stream();

   int64_t B = query.size(0);
   int64_t M = query.size(1);
@@ -143,7 +142,7 @@ efficient_attention_forward_ck(
   at::PhiloxCudaState rng_engine_inputs;
   at::CUDAGeneratorImpl* gen =
       at::get_generator_or_default<at::CUDAGeneratorImpl>(
-          std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());

   std::lock_guard<std::mutex> lock(gen->mutex_);
   // if using dropout, we produce 1 random number for each element of the
@@ -478,26 +477,29 @@ efficient_attention_forward_ck_meta(
     const at::Tensor& query, // [b, seqlen, num_heads_q, K]
     const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
     const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
-    const std::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
+    const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
-    const std::optional<at::Tensor>& seqstart_q,
+    const c10::optional<at::Tensor>& seqstart_q,
     // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
     // position of the first key token for batch $b
-    const std::optional<at::Tensor>& seqstart_k,
+    const c10::optional<at::Tensor>& seqstart_k,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const std::optional<int64_t> max_seqlen_q_,
+    const c10::optional<int64_t> max_seqlen_q_,
     double dropout_p, // attention matrix dropout probability
     bool compute_logsumexp,
     int64_t custom_mask_type,
-    std::optional<double> scale,
-    const std::optional<at::Tensor>& seqlen_k,
-    const std::optional<int64_t> window_size,
-    const std::optional<at::Tensor>& block_tables,
-    const std::optional<int64_t> page_size) {
+    c10::optional<double> scale,
+    const c10::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> window_size,
+    const c10::optional<at::Tensor>& block_tables,
+    const c10::optional<int64_t> page_size) {
   at::SymInt B = query.sym_size(0);
   at::SymInt M = query.sym_size(1);
+  at::SymInt N = key.sym_size(1);
   at::SymInt Hq = query.sym_size(-2);
+  at::SymInt Hkv = key.sym_size(-2);
+  at::SymInt K = query.sym_size(-1);
   at::SymInt Kv = value.sym_size(-1);
   auto opts = query.options();
   std::optional<at::Tensor> logsumexp = std::nullopt;

csrc/attention/ck/fmha/hip_fmha/ck_fmha_util.h

Lines changed: 1 addition & 18 deletions

@@ -11,7 +11,7 @@
 #include <sstream>
 #include <stdexcept>

-#include <torch/torch.h>
+#include <torch/all.h>

 #define XFORMERS_CHECK(COND, ERR) \
   if (!(COND)) {                  \
@@ -47,23 +47,6 @@
   }                               \
   } while (0)

-static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype) {
-  if (dtype == at::ScalarType::Float) {
-    return n * 4;
-  } else if (dtype == at::ScalarType::Half) {
-    return n * 2;
-  } else if (dtype == at::ScalarType::BFloat16) {
-    return n * 2;
-  } else if (dtype == at::ScalarType::Short) {
-    return n * 2;
-  } else if (dtype == at::ScalarType::Int) {
-    return n * 4;
-  } else if (dtype == at::ScalarType::Byte) {
-    return n;
-  }
-  return 0;
-}
-
 /**
  * kernels expect 4D bias/bias.grad with shape
  * (batch_sz, n_heads, n_queries, n_keys). common bias shapes users may pass

csrc/attention/ck/fmha/hip_fmha/ck_tiled_fmha_batched_backward_bf16.hip renamed to csrc/attention/ck/fmha/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp

File renamed without changes.

csrc/attention/ck/fmha/hip_fmha/ck_tiled_fmha_batched_backward_fp16.hip renamed to csrc/attention/ck/fmha/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp

File renamed without changes.
