Skip to content

Commit 877bdca

Browse files
authored
Cutlass 1.3 Release (#42)
CUTLASS 1.3 Release - Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1.
1 parent 19a9d64 commit 877bdca

File tree

256 files changed

+16930
-802
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

256 files changed

+16930
-802
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NVIDIA CUTLASS Changelog
22

3-
## [1.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.1) (2018-12-19)
4-
* Resolved issue with sm50 and sm52 architectures
3+
## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20)
4+
* Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1.
55

66
## [1.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.0) (2018-10-26)
77
* Parallelized reductions across threadblocks ("Split-K")

CMakeLists.txt

Lines changed: 82 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Redistribution and use in source and binary forms, with or without modification, are permitted
44
# provided that the following conditions are met:
@@ -20,7 +20,7 @@
2020
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2121
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2222

23-
cmake_minimum_required(VERSION 3.3.0)
23+
cmake_minimum_required(VERSION 3.3.0 FATAL_ERROR)
2424

2525
set(CUTLASS_LANGUAGES CXX)
2626

@@ -36,7 +36,8 @@ else()
3636
# FindCUDA fails to detect VS 2017 due to a changed directory format of the toolkits.
3737
# For this configuration we need CMake >= 3.9.0 to use the native CUDA support.
3838
if (WIN32 AND MSVC_VERSION GREATER 1800)
39-
message(FATAL_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
39+
message(SEND_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
40+
cmake_minimum_required(VERSION 3.9.0 FATAL_ERROR)
4041
endif()
4142

4243
# Fall back to the FindCUDA version to create an executable with CUDA files
@@ -52,7 +53,11 @@ if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 )
5253
message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
5354
endif()
5455

55-
find_package(CUDA)
56+
find_package(CUDA REQUIRED)
57+
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
58+
# Some platforms (e.g. Visual Studio) don't add the CUDA include directories to the system include
59+
# paths by default, so we add it explicitly here.
60+
5661
find_package(Doxygen QUIET)
5762

5863
###################################################################################################
@@ -61,9 +66,18 @@ find_package(Doxygen QUIET)
6166
#
6267
###################################################################################################
6368

64-
find_library(CUBLAS_LIBRARY cublas HINTS
69+
#
70+
# Conditionally enable cuBLAS
71+
#
72+
set(CUTLASS_ENABLE_CUBLAS ON CACHE BOOL "Enable CUTLASS Tests to build with cuBLAS library.")
73+
74+
if(CUTLASS_ENABLE_CUBLAS)
75+
76+
find_library(CUBLAS_LIBRARY cublas HINTS
6577
${CUDA_TOOLKIT_ROOT_DIR}/lib64
6678
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
79+
endif()
80+
6781

6882
# By default we want to build in Release mode to ensure that we're getting best performance
6983
if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
@@ -78,26 +92,56 @@ if(WIN32)
7892
endif()
7993

8094
if (WIN32)
81-
# Enable more warnings and treat as errors
82-
string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")
95+
# Enable more warnings and treat as errors
96+
string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")
8397

84-
# Disable warning on Unicode characters
85-
string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")
98+
# Disable warning on Unicode characters
99+
string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")
86100

87-
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
88-
string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")
101+
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
102+
string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")
89103

90-
# Verbose option
91-
if (${CUTLASS_NVCC_VERBOSE})
92-
string(APPEND NVCC_FLAGS " -v")
93-
endif()
104+
# Verbose option
105+
if (${CUTLASS_NVCC_VERBOSE})
106+
string(APPEND NVCC_FLAGS " -v")
107+
endif()
94108
endif(WIN32)
95109

96-
set(CUTLASS_NVCC_ARCHS "50;60;61;70;75" CACHE STRING "The SM architectures to build code for.")
110+
set(CUTLASS_NVCC_ARCHS_DEFAULT "")
111+
if(NOT CUDA_VERSION VERSION_LESS 7.5)
112+
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 50)
113+
endif()
114+
if(NOT CUDA_VERSION VERSION_LESS 8.0)
115+
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 60 61)
116+
endif()
117+
if(NOT CUDA_VERSION VERSION_LESS 9.0)
118+
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 70)
119+
endif()
120+
if(NOT CUDA_VERSION VERSION_LESS 9.2)
121+
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 72)
122+
endif()
123+
if(NOT CUDA_VERSION VERSION_LESS 10.0)
124+
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 75)
125+
endif()
126+
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_DEFAULT} CACHE STRING "The SM architectures to build code for.")
127+
97128
set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
98129
set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
99130
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
100131

132+
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
133+
if (CUDA_VERSION VERSION_LESS 10.1)
134+
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT OFF)
135+
else()
136+
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
137+
endif()
138+
139+
set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
140+
"Enable PTX mma instruction for collective matrix multiply operations.")
141+
142+
set(CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST ${CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST} CACHE BOOL
143+
"Enable more kernels instantiated in the perf suite. This might result in longer compiler time. ")
144+
101145
#
102146
# NOTE: running with asan and CUDA requires the following environment variable:
103147
#
@@ -131,6 +175,18 @@ foreach(ARCH ${CUTLASS_NVCC_ARCHS})
131175
endif()
132176
endforeach()
133177

178+
if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
179+
string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1")
180+
endif()
181+
182+
if (CUTLASS_ENABLE_CUBLAS)
183+
string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_CUBLAS=1")
184+
endif()
185+
186+
if (CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST)
187+
add_definitions(-DEXHAUSTIVE_PROF)
188+
endif()
189+
134190
if (CUTLASS_NVCC_KEEP)
135191
string(APPEND NVCC_FLAGS " -keep")
136192
endif()
@@ -174,6 +230,7 @@ file(GLOB CUTLASS_UTIL RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h)
174230
file(GLOB CUTLASS_DEVICE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h)
175231
file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
176232
file(GLOB CUTLASS_REDUCTION RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/reduction/*.h )
233+
file(GLOB CUTLASS_LAYOUT_THREAD RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/layout/thread/*.h)
177234

178235
###################################################################################################
179236
#
@@ -185,16 +242,24 @@ source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM})
185242
source_group("cutlass\\util" FILES ${CUTLASS_UTIL})
186243
source_group("cutlass\\device" FILES ${CUTLASS_DEVICE})
187244
source_group("cutlass\\reduction" FILES ${CUTLASS_REDUCTION})
245+
source_group("cutlass\\layout\\thread" FILES ${CUTLASS_LAYOUT_THREAD})
188246
source_group("cutlass" FILES ${CUTLASS_CORE})
189247

190248
add_library(CUTLASS INTERFACE)
191249
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
250+
251+
# Special policy introduced in CMake 3.13
252+
if (POLICY CMP0076)
253+
cmake_policy(SET CMP0076 NEW)
254+
endif()
255+
192256
target_sources(CUTLASS INTERFACE
193257
${CUTLASS_GEMM}
194258
${CUTLASS_UTIL}
195259
${CUTLASS_DEVICE}
196260
${CUTLASS_CORE}
197261
${CUTLASS_REDUCTION}
262+
${CUTLASS_LAYOUT_THREAD}
198263
)
199264

200265
target_include_directories(CUTLASS INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
@@ -206,6 +271,7 @@ add_custom_target(cutlass_ide SOURCES
206271
${CUTLASS_DEVICE}
207272
${CUTLASS_CORE}
208273
${CUTLASS_REDUCTION}
274+
${CUTLASS_LAYOUT_THREAD}
209275
)
210276
# Doxygen is available. Generate documentation
211277
if (DOXYGEN_FOUND)

CUTLASS.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ CUTLASS core components, and to identify their role in implementing GEMM computa
1414
# <a name="S-design-patterns"></a> 1. Design Patterns
1515

1616
CUTLASS strives to achieve the highest performance possible on NVIDIA GPUs while also offering a
17-
flexible composition that an be easily applied to solve new problems related to Deep Learning and
17+
flexible composition that can be easily applied to solve new problems related to Deep Learning and
1818
linear algebra. Though we intend to make CUTLASS as simple and straightforward as possible, given
1919
a tradeoff between simplicity and performance, CUTLASS chooses performance. Consequently, several
2020
design patterns are necessary to yield a composable structure while also satisfying these performance
@@ -31,7 +31,7 @@ CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvla
3131

3232
## <a name="S-patterns-tiles-iterators"></a> Tiles and Iterators
3333

34-
Efficient dense linear algebra computations emphasize data movement to match the execution of mathemtical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants
34+
Efficient dense linear algebra computations emphasize data movement to match the execution of mathematical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants
3535
specifying element type, size, and data layout. CUTLASS refers to subpartitions as _tiles_.
3636

3737
_Iterators_ are familiar design patterns in C++ that provide an abstraction for accessing individual
@@ -353,7 +353,7 @@ An example of splitK usage can be found [here](examples/06_splitK_gemm/splitK_ge
353353

354354
# Copyright
355355

356-
Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
356+
Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
357357

358358
```
359359
Redistribution and use in source and binary forms, with or without modification, are permitted

README.md

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
22

3-
# CUTLASS 1.2
3+
# CUTLASS 1.3
44

5-
_CUTLASS 1.2 - October 2018_
5+
_CUTLASS 1.3.0 - March 2019_
66

77
CUTLASS is a collection of CUDA C++ template abstractions for implementing
88
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -20,13 +20,18 @@ multiply-accumulate abstractions for 8-bit integer, half-precision floating
2020
point (FP16), single-precision floating point (FP32), and double-precision floating
2121
point (FP64) types. Furthermore, CUTLASS demonstrates CUDA's WMMA API for targeting
2222
the programmable, high-throughput _Tensor Cores_ provided by NVIDIA's Volta architecture
23-
and beyond.
23+
and beyond. Even faster performance on Volta is possible via direct access to
24+
Volta Tensor Cores via `mma.sync` (added in CUDA 10.1).
2425

25-
CUTLASS 1.2 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
26+
CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
2627
[Doxygen documentation](https://nvidia.github.io/cutlass).
2728
We describe the structure of an efficient GEMM in our talk at the
2829
[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
2930

31+
# What's New in CUTLASS 1.3
32+
_March 2019_
33+
* CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1.
34+
3035
# What's New in CUTLASS 1.2
3136
_October 2018_
3237
* [Parallelized Reductions](CUTLASS.md#parallel-reductions-across-gemm-k)
@@ -63,8 +68,8 @@ when compiled with CUDA 10.0.
6368

6469
# Compatibility
6570

66-
CUTLASS performs best when compiled with the [CUDA 10.0 Toolkit](ttps://developer.nvidia.com/cuda-toolkit).
67-
It is compatible with CUDA 9.0, 9.1, and 9.2, but these versions of the CUDA Toolkit do not support new Turing WMMA features.
71+
CUTLASS performs best when compiled with the [CUDA 10.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
72+
It is also compatible with CUDA 9.0, 9.1, 9.2, and 10.0.
6873

6974
We have tested the following environments.
7075

@@ -77,7 +82,7 @@ We have tested the following environments.
7782
| Ubuntu 18.04 | GCC 7.3.0 |
7883

7984
CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
80-
any Maxwell-, Pascal-, or Volta-architecture NVIDIA GPU.
85+
any Maxwell-, Pascal-, Volta-, or Turing-architecture NVIDIA GPU.
8186

8287
|**GPU**|
8388
|---|
@@ -220,6 +225,9 @@ Program usage:
220225
221226
# Varies GEMM K dimension for SGEMM and IGEMM with column-major multiplicands
222227
$ ./tools/test/perf/cutlass_perf_test --m=10240 --n=4096 --k=1024:8192:128 --kernels=sgemm_nn,igemm_nn
228+
229+
# Executes GEMM kernel on Volta Tensor Cores
230+
$ ./tools/test/perf/cutlass_perf_test --kernels=s884gemm_nt
223231
```
224232

225233
# About
@@ -230,7 +238,7 @@ CUTLASS is released by NVIDIA Corporation as Open Source software under the
230238

231239
# Copyright
232240

233-
Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
241+
Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
234242

235243
```
236244
Redistribution and use in source and binary forms, with or without modification, are permitted
@@ -253,4 +261,3 @@ Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
253261
STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
254262
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
255263
```
256-

0 commit comments

Comments
 (0)