diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu index 6e01a101..b38de0c8 100644 --- a/examples/06_splitK_gemm/splitk_gemm.cu +++ b/examples/06_splitK_gemm/splitk_gemm.cu @@ -205,7 +205,7 @@ int run() { cutlass::HostTensor tensor_a( problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b( - problem_size.nk()); // <- Create matrix B with dimensions N x K + problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c( problem_size.mn()); // <- Create matrix C with dimensions M x N cutlass::HostTensor tensor_d( diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu index 7faaa98a..1f83a61a 100644 --- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -132,7 +132,7 @@ int run() { cutlass::HostTensor tensor_a( problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b( - problem_size.nk()); // <- Create matrix B with dimensions N x K + problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c_bias( {problem_size.m(), 1}); // <- Create matrix C with dimensions M x 1 @@ -234,7 +234,6 @@ int run() { tensor_a.device_ref(), tensor_b.device_ref(), 0, - tensor_c_bias.device_ref(), tensor_ref_d.device_ref()); // Wait for kernels to finish diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index ef65f20b..a862e65d 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -823,7 +823,7 @@ struct Mma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - asm volatile("_mma.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));