diff --git a/CHANGELOG.md b/CHANGELOG.md index 9468cc1a..d44e26fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # NVIDIA CUTLASS Changelog +## [1.3.2](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.2) (2019-07-09) + * Performance improvement for Volta Tensor Cores TN and TT layouts. + ## [1.3.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.1) (2019-04-09) * Corrected NVRTC unit tests. diff --git a/README.md b/README.md index 4c539e92..e43fb20a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # CUTLASS 1.3 -_CUTLASS 1.3.1 - April 2019_ +_CUTLASS 1.3.2 - July 2019_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -28,9 +28,6 @@ CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the acco We describe the structure of an efficient GEMM in our talk at the [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf). -# What's New in CUTLASS 1.3.1 -_April 2019_ -* CUTLASS 1.3.1 corrected NVRTC unit tests.. # What's New in CUTLASS 1.3 _March 2019_ @@ -60,6 +57,8 @@ _September 2018_ * [Reference implementations](tools/util/reference) for tensor operations in [host](tools/util/reference/host) and [device](tools/util/reference/device) code * Added `HostMatrix<>` for simplified matrix creation +For all updates, see the [CUTLASS changelog](CHANGELOG.md). + # Performance

diff --git a/cutlass/cutlass.h b/cutlass/cutlass.h index a44950cb..c7a80839 100644 --- a/cutlass/cutlass.h +++ b/cutlass/cutlass.h @@ -34,7 +34,7 @@ #define CUTLASS_MAJOR 1 #define CUTLASS_MINOR 3 -#define CUTLASS_PATCH 1 +#define CUTLASS_PATCH 2 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH) #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) diff --git a/cutlass/gemm/volta884_shared_tile_crosswise.h b/cutlass/gemm/volta884_shared_tile_crosswise.h index 361c791a..5351a4e5 100644 --- a/cutlass/gemm/volta884_shared_tile_crosswise.h +++ b/cutlass/gemm/volta884_shared_tile_crosswise.h @@ -237,6 +237,12 @@ struct Volta884ThreadblockMultiplicandStoreIterator offset = offset_func(ptr_idx); pointer[ptr_idx] = _params.pointer + (_block_offset + offset).template dot(stride); } + + if (((threadIdx.x >> 5) * Iterations::kD) & 2) { + Scalar *tmp = pointer[0]; + pointer[0] = pointer[1]; + pointer[1] = tmp; + } } /// Stores a fragment @@ -254,16 +260,12 @@ struct Volta884ThreadblockMultiplicandStoreIterator> 5); - - int ldg_idx = d + warp_id * Iterations::kD; int k_idx = w + h * 8; int smem_row = (d >> 1); // Two store pointers - int ptr_idx = ((ldg_idx & 1) ^ ((ldg_idx >> 1) & 1)); - - Scalar *_pointer = pointer[ptr_idx]; + Scalar *_pointer = pointer[(d & 1) ^ ((d >> 1) & 1)]; + Coord<4> sts_offset = make_Coord(k_idx, smem_row, 0, 0); Store::store( @@ -277,6 +279,7 @@ struct Volta884ThreadblockMultiplicandStoreIterator(stride); @@ -293,6 +296,7 @@ struct Volta884ThreadblockMultiplicandStoreIterator(stride); diff --git a/tools/test/unit/gemm/volta884_gemm.cu b/tools/test/unit/gemm/volta884_gemm.cu index 707c6d37..376c0126 100644 --- a/tools/test/unit/gemm/volta884_gemm.cu +++ b/tools/test/unit/gemm/volta884_gemm.cu @@ -183,7 +183,7 @@ TEST(Volta884_f16_s884gemm_128x128x32_tt, short_480x280x224) { // Contiguous - s884gemm // //////////////////////////////////////////////////////////////////////////////////////////////////// -#if 0 + TEST(Volta884_f16_s884gemm_64x64x32_nt, 64x64x32) { typedef cutlass::gemm::Volta884GemmTraits< @@ -218,7 +218,6 @@ TEST(Volta884_f16_s884gemm_64x64x32_nt, 64x64x30_residue) { run_gemm(64, 64, 30); } -#if 0 //////////////////////////////////////////////////////////////////////////////////////////////////// TEST(Volta884_f16_s884gemm_64x64x32_nt, 64x64x64) { @@ -874,7 +873,6 @@ TEST(Volta884_f16_s884gemm_128x128x32_nn, 392x264x192) { run_gemm(392, 264, 192); } -#endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1281,7 +1279,6 @@ TEST(Volta884_f16_s884gemm_f16_128x256x32_tn, 480x280x224) { run_gemm(480, 280, 224); } -#endif //////////////////////////////////////////////////////////////////////////////////////////////////// #endif // if defined(CUTLASS_ENABLE_TENSOR_CORE_MMA)