diff --git a/CHANGELOG.md b/CHANGELOG.md index 73c2f768..c0606491 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## 1.1.0 (2018-09-19) * Turing Features - * WMMA GEMM targeting TensorCores - INT8, INT4, INT1 + * WMMA GEMM targeting TensorCores - INT8, INT4, 1-bit * Batched Strided GEMM * Threadblock rasterization strategies * Improved performance for adverse problem sizes and data layouts diff --git a/README.md b/README.md index d5bd15ef..c53a42f4 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ We describe the structure of an efficient GEMM in our talk at the * [Examples](examples/) * Basic GEMM, tensor views, CUTLASS utilities, batched GEMM, WMMA GEMM * Turing Features - * [WMMA GEMM targeting TensorCores](tools/test/unit/gemm/wmma_integer_gemm.cu) - INT8, INT4, INT1 + * [WMMA GEMM targeting TensorCores](tools/test/unit/gemm/wmma_integer_gemm.cu) - INT8, INT4, 1-bit * [Batched Strided GEMM](tools/test/unit/gemm/batched_strided_sgemm_128x128x8.cu) * [Threadblock rasterization strategies](tools/test/unit/gemm/sgemm_threadblock_swizzle_nt.cu) * Improved performance for adverse problem sizes and data layouts diff --git a/cutlass/cutlass.h b/cutlass/cutlass.h index 097714c0..15ea83c0 100644 --- a/cutlass/cutlass.h +++ b/cutlass/cutlass.h @@ -32,8 +32,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// #define CUTLASS_MAJOR 1 -#define CUTLASS_MINOR 0 -#define CUTLASS_PATCH 1 +#define CUTLASS_MINOR 1 +#define CUTLASS_PATCH 0 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH) #ifdef __NVCC__ diff --git a/media/images/cutlass-performance-plot.png b/media/images/cutlass-performance-plot.png index f61c2e50..041d28b3 100644 Binary files a/media/images/cutlass-performance-plot.png and b/media/images/cutlass-performance-plot.png differ diff --git a/tools/test/unit/gemm/batched_strided_sgemm_128x128x8.cu b/tools/test/unit/gemm/batched_strided_sgemm_128x128x8.cu index ffeba34f..fcee155f 100644 --- a/tools/test/unit/gemm/batched_strided_sgemm_128x128x8.cu +++ b/tools/test/unit/gemm/batched_strided_sgemm_128x128x8.cu @@ -34,7 +34,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_256x384x64x3_nn) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(256/*m*/, 384/*n*/, 64/*k*/, 3 /*batch_size*/); } @@ -44,7 +43,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_128x384x192x2_nn) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(128/*m*/, 384/*n*/, 192/*k*/, 2 /*batch_size*/); } @@ -54,7 +52,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_127x384x192x2_nn) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(127/*m*/, 384/*n*/, 192/*k*/, 2 /*batch_size*/); } @@ -64,7 +61,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_127x388x190x2_nn) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(127/*m*/, 388/*n*/, 190/*k*/, 2 /*batch_size*/); } @@ -74,7 +70,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_256x384x64x3_nt) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(256/*m*/, 384/*n*/, 64/*k*/, 3 /*batch_size*/); } @@ -84,7 +79,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_128x384x192x2_nt) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(128/*m*/, 384/*n*/, 192/*k*/, 2 /*batch_size*/); } @@ -96,7 +90,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_256x384x64x3_tn) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(256/*m*/, 384/*n*/, 64/*k*/, 3 /*batch_size*/); } @@ -106,7 +99,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_128x384x192x2_tn) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(128/*m*/, 384/*n*/, 192/*k*/, 2 /*batch_size*/); } @@ -118,7 +110,6 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_256x384x64x3_tt) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(256/*m*/, 384/*n*/, 64/*k*/, 3 /*batch_size*/); } @@ -128,8 +119,8 @@ TEST(Sgemm_strided_batched_128x128x8, sgemm_128x384x192x2_tt) { typedef cutlass::gemm::SgemmTraits > SgemmTraits; - //think about using run_gemm directly run_batched_strided_gemm(128/*m*/, 384/*n*/, 192/*k*/, 2 /*batch_size*/); } //////////////////////////////////////////////////////////////////////////////////////////////////// +