Skip to content

Commit 8437724

Browse files
authored
Merge pull request #2 from Artem-B/clang-fixes
Merging "Clang fixes" into master.
2 parents 95b0578 + 1c9b54d commit 8437724

File tree

3 files changed

+49
-10
lines changed

3 files changed

+49
-10
lines changed

common.mk

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,51 +41,64 @@ endif
4141

4242
ifeq (70, $(findstring 70, $(SM_ARCH)))
4343
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
44+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_70
4445
endif
4546
ifeq (62, $(findstring 62, $(SM_ARCH)))
4647
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
48+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_62
4749
endif
4850
ifeq (61, $(findstring 61, $(SM_ARCH)))
4951
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
52+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_61
5053
endif
5154
ifeq (60, $(findstring 60, $(SM_ARCH)))
5255
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
56+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_60
5357
endif
5458
ifeq (52, $(findstring 52, $(SM_ARCH)))
5559
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
60+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_52
5661
endif
5762
ifeq (37, $(findstring 37, $(SM_ARCH)))
5863
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
64+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_37
5965
endif
6066
ifeq (35, $(findstring 35, $(SM_ARCH)))
6167
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
68+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_35
6269
endif
6370
ifeq (30, $(findstring 30, $(SM_ARCH)))
6471
SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
72+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_30
6573
endif
6674
ifeq (21, $(findstring 21, $(SM_ARCH)))
6775
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
76+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_21
6877
endif
6978
ifeq (20, $(findstring 20, $(SM_ARCH)))
7079
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\"
80+
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_20
7181
endif
7282

7383

7484
# [verbose=<0|1>] Verbose toolchain output from nvcc option
7585
ifeq ($(verbose), 1)
7686
NVCCFLAGS += -v
87+
CLANG_CFLAGS += -v
7788
endif
7889

7990

8091
# [keep=<0|1>] Keep intermediate compilation artifacts option
8192
ifeq ($(keep), 1)
8293
NVCCFLAGS += -keep
94+
CLANG_CFLAGS += --save-temps
8395
endif
8496

8597

8698
# [debug=<0|1>] Generate debug mode code
8799
ifeq ($(debug), 1)
88100
NVCCFLAGS += -G
101+
CLANG_CFLAGS += --cuda-noopt-device-debug
89102
endif
90103

91104

@@ -107,7 +120,7 @@ OSUPPER := $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
107120

108121
# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
109122
NVCCFLAGS += -O3 -Xptxas -v
110-
123+
CLANG_CFLAGS += -O3 -Xcuda-ptxas -v
111124
ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
112125
# For MSVC
113126

@@ -139,15 +152,30 @@ else
139152

140153
endif
141154

155+
# compiler=clang Enables compilation with clang.
156+
157+
ifeq ($(compiler), clang)
158+
# NVCC_VERSION is used as the proxy for the CUDA version.
159+
BIN_SUFFIX := sm$(SM_ARCH)_clang_cuda_$(NVCC_VERSION)
160+
# Clangs needs few extra flags to point it to CUDA SDK
161+
# and link the binaries with CUDA runtime.
162+
CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..))
163+
CLANG_CFLAGS += --cuda-path=$(CUDA_BASE)
164+
LIBINC += -L$(CUDA_BASE)/lib64 -Wl,-rpath=$(CUDA_BASE)/lib64
165+
LIBS += -lcudart
142166

143-
# Suffix to append to each binary
144-
BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION)
167+
# Replace NVCC and its options with clang++.
168+
NVCC = clang++
169+
NVCCFLAGS = $(CLANG_CFLAGS)
170+
SM_TARGETS = $(CLANG_SM_TARGETS)
171+
else
172+
# Suffix to append to each binary
173+
BIN_SUFFIX := sm$(SM_ARCH)_nvcc_$(NVCC_VERSION)
174+
endif
145175

146176

147177
#-------------------------------------------------------------------------------
148178
# Function for computing dependency Lists
149179
#-------------------------------------------------------------------------------
150180

151181
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
152-
153-

cutlass/gemm/block_task.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,8 @@ struct block_task
484484
* - Applies the scalar multipliers and addends to the accumulators
485485
* - Write the result to the output matrix
486486
*/
487-
inline __device__ void epilogue()
487+
__forceinline__ __device__
488+
void epilogue()
488489
{
489490
// Wait for predecessor thread block(s) to produce block-wide tile of
490491
// exclsuive partial-sums
@@ -551,7 +552,7 @@ struct block_task
551552
* Consume a tile of A and B each
552553
*/
553554
template <bool DoGlobalPrefetch>
554-
inline __device__
555+
__forceinline__ __device__
555556
void consume_tile()
556557
{
557558
// Unroll BlockDpVectorsK iterations of outer-product accumulations
@@ -612,7 +613,7 @@ struct block_task
612613
/**
613614
* Compute GEMM
614615
*/
615-
inline __device__
616+
__forceinline__ __device__
616617
void run()
617618
{
618619
// Quit if the thread block is fully out-of-bounds

cutlass/util/debug.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,19 @@ namespace cutlass {
4545
*/
4646
#if !defined(CUDA_LOG)
4747
#if !defined(__CUDA_ARCH__)
48-
#define CUDA_LOG(format, ...) printf(format,__VA_ARGS__)
48+
#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
4949
#else
50-
#define CUDA_LOG(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__);
50+
inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; }
51+
inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; }
52+
inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; }
53+
inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; }
54+
inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; }
55+
inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; }
56+
#define CUDA_LOG(format, ...) \
57+
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
58+
get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \
59+
get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \
60+
__VA_ARGS__);
5161
#endif
5262
#endif
5363

0 commit comments

Comments
 (0)