[examples] Fix typos in SYRK and TRMM examples (#507)

This commit is contained in:
Jack Kosaian 2022-06-03 22:52:41 -04:00 committed by GitHub
parent 858c735856
commit 0abaac84ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 7 deletions

View File

@ -37,7 +37,7 @@
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
in CUTLASS.
@ -83,7 +83,7 @@ cudaError_t CutlassSsyrkNN(
int ldc) {
// Define the type for a double-precision CUTLASS SYRK with column-major
// input matrices and 128x128x8 threadblock tile size (chosen by default).
// input matrices and 16x32x16 threadblock tile size (chosen by default).
//
// To keep the interface manageable, several helpers are defined for plausible compositions
// including the following example for double-precision SYRK. Typical values are used as
@ -138,7 +138,7 @@ cudaError_t CutlassSsyrkNN(
{alpha, beta}, // Scalars used in the Epilogue
reinterpret_cast<void const *>(A),
const_cast<void *>(reinterpret_cast<void *>(C)),
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so urce C matrix)
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
(int64_t)N*K, // Batch strides
(int64_t)N*N,
(int64_t)N*N,

View File

@ -37,7 +37,7 @@
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
in CUTLASS.
@ -84,7 +84,7 @@ cudaError_t CutlassStrmmNN(
int ldc) {
// Define the type for a double-precision CUTLASS TRMM with column-major
// input matrices and 128x128x8 threadblock tile size (chosen by default).
// input matrices and 64x64x16 threadblock tile size (chosen by default).
//
// To keep the interface manageable, several helpers are defined for plausible compositions
// including the following example for double-precision TRMM. Typical values are used as
@ -107,7 +107,7 @@ cudaError_t CutlassStrmmNN(
double,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
cutlass::gemm::GemmShape<64,64, 16>,
cutlass::gemm::GemmShape<64, 64, 16>,
cutlass::gemm::GemmShape<32, 32, 16>,
cutlass::gemm::GemmShape<8, 8, 4>,
cutlass::epilogue::thread::LinearCombination<
@ -143,7 +143,7 @@ cudaError_t CutlassStrmmNN(
{alpha}, // Scalars used in the Epilogue
reinterpret_cast<void const *>(A),
reinterpret_cast<void const *>(B),
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so urce C matrix)
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
(int64_t)M*M, // Batch strides
(int64_t)M*N,
(int64_t)M*N,