[examples] Fix typos in SYRK and TRMM examples (#507)
This commit is contained in:
parent
858c735856
commit
0abaac84ea
@ -37,7 +37,7 @@
|
|||||||
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
|
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
|
||||||
all matrices have column-major layout.
|
all matrices have column-major layout.
|
||||||
|
|
||||||
The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
|
The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
|
||||||
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
|
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
|
||||||
in CUTLASS.
|
in CUTLASS.
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ cudaError_t CutlassSsyrkNN(
|
|||||||
int ldc) {
|
int ldc) {
|
||||||
|
|
||||||
// Define type definition for double-precision CUTLASS SYRK with column-major
|
// Define type definition for double-precision CUTLASS SYRK with column-major
|
||||||
// input matrices and 128x128x8 threadblock tile size (chosen by default).
|
// input matrices and 16x32x16 threadblock tile size (chosen by default).
|
||||||
//
|
//
|
||||||
// To keep the interface manageable, several helpers are defined for plausible compositions
|
// To keep the interface manageable, several helpers are defined for plausible compositions
|
||||||
// including the following example for double-precision SYRK. Typical values are used as
|
// including the following example for double-precision SYRK. Typical values are used as
|
||||||
@ -138,7 +138,7 @@ cudaError_t CutlassSsyrkNN(
|
|||||||
{alpha, beta}, // Scalars used in the Epilogue
|
{alpha, beta}, // Scalars used in the Epilogue
|
||||||
reinterpret_cast<void const *>(A),
|
reinterpret_cast<void const *>(A),
|
||||||
const_cast<void *>(reinterpret_cast<void *>(C)),
|
const_cast<void *>(reinterpret_cast<void *>(C)),
|
||||||
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so urce C matrix)
|
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
|
||||||
(int64_t)N*K, // Batch strides
|
(int64_t)N*K, // Batch strides
|
||||||
(int64_t)N*N,
|
(int64_t)N*N,
|
||||||
(int64_t)N*N,
|
(int64_t)N*N,
|
||||||
|
@ -37,7 +37,7 @@
|
|||||||
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
|
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
|
||||||
all matrices have column-major layout.
|
all matrices have column-major layout.
|
||||||
|
|
||||||
The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
|
The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.
|
||||||
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
|
See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
|
||||||
in CUTLASS.
|
in CUTLASS.
|
||||||
|
|
||||||
@ -84,7 +84,7 @@ cudaError_t CutlassStrmmNN(
|
|||||||
int ldc) {
|
int ldc) {
|
||||||
|
|
||||||
// Define type definition for double-precision CUTLASS TRMM with column-major
|
// Define type definition for double-precision CUTLASS TRMM with column-major
|
||||||
// input matrices and 128x128x8 threadblock tile size (chosen by default).
|
// input matrices and 64x64x16 threadblock tile size (chosen by default).
|
||||||
//
|
//
|
||||||
// To keep the interface manageable, several helpers are defined for plausible compositions
|
// To keep the interface manageable, several helpers are defined for plausible compositions
|
||||||
// including the following example for double-precision TRMM. Typical values are used as
|
// including the following example for double-precision TRMM. Typical values are used as
|
||||||
@ -107,7 +107,7 @@ cudaError_t CutlassStrmmNN(
|
|||||||
double,
|
double,
|
||||||
cutlass::arch::OpClassTensorOp,
|
cutlass::arch::OpClassTensorOp,
|
||||||
cutlass::arch::Sm80,
|
cutlass::arch::Sm80,
|
||||||
cutlass::gemm::GemmShape<64,64, 16>,
|
cutlass::gemm::GemmShape<64, 64, 16>,
|
||||||
cutlass::gemm::GemmShape<32, 32, 16>,
|
cutlass::gemm::GemmShape<32, 32, 16>,
|
||||||
cutlass::gemm::GemmShape<8, 8, 4>,
|
cutlass::gemm::GemmShape<8, 8, 4>,
|
||||||
cutlass::epilogue::thread::LinearCombination<
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
@ -143,7 +143,7 @@ cudaError_t CutlassStrmmNN(
|
|||||||
{alpha}, // Scalars used in the Epilogue
|
{alpha}, // Scalars used in the Epilogue
|
||||||
reinterpret_cast<void const *>(A),
|
reinterpret_cast<void const *>(A),
|
||||||
reinterpret_cast<void const *>(B),
|
reinterpret_cast<void const *>(B),
|
||||||
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so urce C matrix)
|
reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
|
||||||
(int64_t)M*M, // Batch strides
|
(int64_t)M*M, // Batch strides
|
||||||
(int64_t)M*N,
|
(int64_t)M*N,
|
||||||
(int64_t)M*N,
|
(int64_t)M*N,
|
||||||
|
Loading…
Reference in New Issue
Block a user