From 0abaac84eaa96b45144391b4bc75e2f9eb863d09 Mon Sep 17 00:00:00 2001
From: Jack Kosaian <jkosaian@andrew.cmu.edu>
Date: Fri, 3 Jun 2022 22:52:41 -0400
Subject: [PATCH] [examples] Fix typos in SYRK and TRMM examples (#507)

---
 examples/31_basic_syrk/basic_syrk.cu | 6 +++---
 examples/32_basic_trmm/basic_trmm.cu | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu
index e207cf73..79b3ab60 100644
--- a/examples/31_basic_syrk/basic_syrk.cu
+++ b/examples/31_basic_syrk/basic_syrk.cu
@@ -37,7 +37,7 @@
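-  the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes
+  the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes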
   all matrices have column-major layout.
 
-  The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
+  The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
   See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
   in CUTLASS.
 
@@ -83,7 +83,7 @@ cudaError_t CutlassSsyrkNN(
   int ldc) {
 
   // Define type definition for double-precision CUTLASS SYRK with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  // input matrices and 16x32x16 threadblock tile size (chosen by default).
   //
   // To keep the interface manageable, several helpers are defined for plausible compositions
   // including the following example for double-precision SYRK. Typical values are used as
@@ -138,7 +138,7 @@ cudaError_t CutlassSsyrkNN(
                               {alpha, beta}, // Scalars used in the Epilogue
                               reinterpret_cast<void const *>(A),
                               const_cast<void *>(reinterpret_cast<void *>(C)),
-                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so    urce C matrix)
+                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
                               (int64_t)N*K, // Batch strides
                               (int64_t)N*N,
                               (int64_t)N*N,
diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu
index 54ac11ff..988b7a62 100644
--- a/examples/32_basic_trmm/basic_trmm.cu
+++ b/examples/32_basic_trmm/basic_trmm.cu
@@ -37,7 +37,7 @@
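-  the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes
+  the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes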
   all matrices have column-major layout.
 
-  The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
+  The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.
   See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
   in CUTLASS.
 
@@ -84,7 +84,7 @@ cudaError_t CutlassStrmmNN(
   int ldc) {
 
   // Define type definition for double-precision CUTLASS TRMM with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  // input matrices and 64x64x16 threadblock tile size (chosen by default).
   //
   // To keep the interface manageable, several helpers are defined for plausible compositions
   // including the following example for double-precision TRMM. Typical values are used as
@@ -107,7 +107,7 @@ cudaError_t CutlassStrmmNN(
     double,
     cutlass::arch::OpClassTensorOp,
     cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64,64, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
     cutlass::gemm::GemmShape<32, 32, 16>,
     cutlass::gemm::GemmShape<8, 8, 4>,
     cutlass::epilogue::thread::LinearCombination<
@@ -143,7 +143,7 @@ cudaError_t CutlassStrmmNN(
                               {alpha}, // Scalars used in the Epilogue
                               reinterpret_cast<void const *>(A),
                               reinterpret_cast<void const *>(B),
-                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so    urce C matrix)
+                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
                               (int64_t)M*M, // Batch strides
                               (int64_t)M*N,
                               (int64_t)M*N,