[examples] Fix typos in SYRK and TRMM examples (#507)
This commit is contained in:
parent
858c735856
commit
0abaac84ea
@ -37,7 +37,7 @@
|
||||
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
|
||||
all matrices have column-major layout.
|
||||
|
||||
The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
|
||||
The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
|
||||
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
|
||||
in CUTLASS.
|
||||
|
||||
@ -83,7 +83,7 @@ cudaError_t CutlassSsyrkNN(
|
||||
int ldc) {
|
||||
|
||||
// Define type definition for double-precision CUTLASS SYRK with column-major
|
||||
// input matrices and 128x128x8 threadblock tile size (chosen by default).
|
||||
// input matrices and 16x32x16 threadblock tile size (chosen by default).
|
||||
//
|
||||
// To keep the interface manageable, several helpers are defined for plausible compositions
|
||||
// including the following example for double-precision SYRK. Typical values are used as
|
||||
@ -138,7 +138,7 @@ cudaError_t CutlassSsyrkNN(
|
||||
{alpha, beta}, // Scalars used in the Epilogue
|
||||
reinterpret_cast<void const *>(A),
|
||||
const_cast<void *>(reinterpret_cast<void *>(C)),
|
||||
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so urce C matrix)
|
||||
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
|
||||
(int64_t)N*K, // Batch strides
|
||||
(int64_t)N*N,
|
||||
(int64_t)N*N,
|
||||
|
@ -37,7 +37,7 @@
|
||||
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
|
||||
all matrices have column-major layout.
|
||||
|
||||
The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
|
||||
The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.
|
||||
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
|
||||
in CUTLASS.
|
||||
|
||||
@ -84,7 +84,7 @@ cudaError_t CutlassStrmmNN(
|
||||
int ldc) {
|
||||
|
||||
// Define type definition for double-precision CUTLASS TRMM with column-major
|
||||
// input matrices and 128x128x8 threadblock tile size (chosen by default).
|
||||
// input matrices and 64x64x16 threadblock tile size (chosen by default).
|
||||
//
|
||||
// To keep the interface manageable, several helpers are defined for plausible compositions
|
||||
// including the following example for double-precision TRMM. Typical values are used as
|
||||
@ -107,7 +107,7 @@ cudaError_t CutlassStrmmNN(
|
||||
double,
|
||||
cutlass::arch::OpClassTensorOp,
|
||||
cutlass::arch::Sm80,
|
||||
cutlass::gemm::GemmShape<64,64, 16>,
|
||||
cutlass::gemm::GemmShape<64, 64, 16>,
|
||||
cutlass::gemm::GemmShape<32, 32, 16>,
|
||||
cutlass::gemm::GemmShape<8, 8, 4>,
|
||||
cutlass::epilogue::thread::LinearCombination<
|
||||
@ -143,7 +143,7 @@ cudaError_t CutlassStrmmNN(
|
||||
{alpha}, // Scalars used in the Epilogue
|
||||
reinterpret_cast<void const *>(A),
|
||||
reinterpret_cast<void const *>(B),
|
||||
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so urce C matrix)
|
||||
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
|
||||
(int64_t)M*M, // Batch strides
|
||||
(int64_t)M*N,
|
||||
(int64_t)M*N,
|
||||
|
Loading…
Reference in New Issue
Block a user