Commit 4faf1a1
File tree
2,773 files changed
+3162
-3053
lines changed- .github/workflows
- cmake
- examples
- 00_basic_gemm
- 01_cutlass_utilities
- 02_dump_reg_shmem
- 03_visualize_layout
- 04_tile_iterator
- 05_batched_gemm
- 06_splitK_gemm
- 07_volta_tensorop_gemm
- 08_turing_tensorop_gemm
- 09_turing_tensorop_conv2dfprop
- 10_planar_complex
- 11_planar_complex_array
- 12_gemm_bias_relu
- 13_two_tensor_op_fusion
- device
- kernel
- reference/device
- threadblock
- 14_ampere_tf32_tensorop_gemm
- 15_ampere_sparse_tensorop_gemm
- 16_ampere_tensorop_conv2dfprop
- 17_fprop_per_channel_bias
- 18_ampere_fp64_tensorop_affine2_gemm
- 19_tensorop_canonical
- 20_simt_canonical
- 21_quaternion_gemm
- 22_quaternion_conv
- 23_ampere_gemm_operand_reduction_fusion
- 24_gemm_grouped
- 25_ampere_fprop_mainloop_fusion
- 26_ampere_wgrad_mainloop_fusion
- 27_ampere_3xtf32_fast_accurate_tensorop_gemm
- 28_ampere_3xtf32_fast_accurate_tensorop_fprop
- 29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm
- 30_wgrad_split_k
- 31_basic_syrk
- 32_basic_trmm
- 33_ampere_3xtf32_tensorop_symm
- 34_transposed_conv2d
- 35_gemm_softmax
- 36_gather_scatter_fusion
- 37_gemm_layernorm_gemm_fusion
- 38_syr2k_grouped
- 39_gemm_permute
- 40_cutlass_py
- customizable
- 41_fused_multi_head_attention
- epilogue
- gemm
- iterators
- transform
- 42_ampere_tensorop_group_conv
- 43_ell_block_sparse_gemm
- 44_multi_gemm_ir_and_codegen
- fixed_impl
- epilogue
- threadblock
- warp
- gemm/warp
- ir_gen
- 45_dual_gemm
- device
- kernel
- threadblock
- thread
- 46_depthwise_simt_conv2dfprop
- 47_ampere_gemm_universal_streamk
- 48_hopper_warp_specialized_gemm
- 49_hopper_gemm_with_collective_builder
- 50_hopper_gemm_with_epilogue_swizzle
- 51_hopper_gett
- 52_hopper_gather_scatter_fusion
- 53_hopper_gemm_permute
- 54_hopper_fp8_warp_specialized_gemm
- 55_hopper_mixed_dtype_gemm
- 56_hopper_ptr_array_batched_gemm
- 57_hopper_grouped_gemm
- 58_ada_fp8_gemm
- 59_ampere_gather_scatter_conv
- 60_cutlass_import
- 61_hopper_gemm_with_topk_and_softmax
- 62_hopper_sparse_gemm
- 63_hopper_gemm_with_weight_prefetch
- collective
- kernel
- pipeline
- 64_ada_fp8_gemm_grouped
- 65_distributed_gemm
- 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling
- 68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling
- 69_hopper_mixed_dtype_grouped_gemm
- 70_blackwell_gemm
- 71_blackwell_gemm_with_collective_builder
- 72_blackwell_narrow_precision_gemm
- 73_blackwell_gemm_preferred_cluster
- 74_blackwell_gemm_streamk
- 75_blackwell_grouped_gemm
- 76_blackwell_conv
- 77_blackwell_fmha
- collective
- common
- device
- kernel
- reference
- 78_blackwell_emulated_bf16x9_gemm
- 79_blackwell_geforce_gemm
- 80_blackwell_geforce_sparse_gemm
- 81_blackwell_gemm_blockwise
- 82_blackwell_distributed_gemm
- 83_blackwell_sparse_gemm
- 84_blackwell_narrow_precision_sparse_gemm
- 86_blackwell_mixed_dtype_gemm
- 87_blackwell_geforce_gemm_blockwise
- 88_hopper_fmha
- collective
- device
- kernel
- reference
- 89_sm103_fp4_ultra_gemm
- 90_sm103_fp4_ultra_grouped_gemm
- 91_fp4_gemv
- 92_blackwell_moe_gemm
- common
- cute
- tutorial
- blackwell
- hopper
- python
- CuTeDSL
- ampere
- blackwell_geforce
- blackwell
- blockwise_gemm
- mamba2_ssd
- mixed_input_fmha
- cute
- ffi
- tvm_ffi
- helpers
- hopper
- notebooks
- utils
- deprecated
- include
- cute
- algorithm
- arch
- atom
- container
- numeric
- util
- cutlass
- arch
- conv
- collective
- builders
- device
- kernel
- threadblock
- thread
- warp
- detail
- collective
- epilogue
- collective
- builders
- fusion
- threadblock
- fusion
- thread
- warp
- experimental/distributed
- device
- kernel
- schedules
- gemm
- collective
- builders
- device
- kernel
- threadblock
- thread
- warp
- layout
- pipeline
- platform
- reduction
- device
- kernel
- thread
- thread
- transform
- collective
- device
- kernel
- threadblock
- thread
- warp
- media/docs/cpp
- build
- cute
- python
- CuTeDSL
- cutlass
- base_dsl
- _mlir_helpers
- export
- runtime
- tvm_ffi_builder
- utils
- cute
- arch
- export
- nvgpu
- cpasync
- tcgen05
- warpgroup
- warp
- cutlass_dsl
- pipeline
- utils
- cutlass_cppgen
- backend
- evt
- backend
- frontend
- ir
- passes
- utils
- emit
- epilogue
- op
- utils
- cutlass_library
- docs_src/source
- pycute
- test
- python
- cutlass
- conv2d
- emit
- evt
- utils
- gemm
- interface
- pycute
- self_contained_includes
- unit
- cluster_launch
- common
- conv
- device_3x
- dgrad
- fprop
- wgrad
- device
- core
- cute
- ampere
- core
- hopper
- layout
- msvc_compilation
- turing
- volta
- epilogue
- threadblock
- thread
- warp
- gemm
- device
- sm100_blockscaled_sparse_tensorop_gemm
- sm100_blockscaled_tensorop_gemm
- sm100_sparse_tensorop_gemm
- narrow_precision
- sm100_tensorop_gemm
- narrow_precision
- sm120_blockscaled_sparse_tensorop_gemm
- sm120_blockscaled_tensorop_gemm
- sm120_sparse_tensorop_gemm
- sm120_tensorop_gemm
- kernel
- threadblock
- thread
- host
- warp
- layout
- nvrtc
- cutlass/nvrtc
- kernel/thread
- stdlib
- thread
- pipeline
- reduction
- device
- kernel
- thread
- substrate
- transform
- device
- kernel
- threadblock
- util
- tools
- library
- include/cutlass/library
- src
- reduction
- reference
- profiler
- include/cutlass/profiler
- src
- util
- include/cutlass/util
- reference
- detail
- device
- kernel
- thread
- host
- scripts
Some content is hidden
Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
2,773 files changed
+3162
-3053
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
3 | | - | |
| 3 | + | |
4 | 4 | | |
5 | 5 | | |
6 | 6 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
0 commit comments