diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu
index 6e01a101..b38de0c8 100644
--- a/examples/06_splitK_gemm/splitk_gemm.cu
+++ b/examples/06_splitK_gemm/splitk_gemm.cu
@@ -205,7 +205,7 @@ int run() {
   cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
       problem_size.mk());  // <- Create matrix A with dimensions M x K
   cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
-      problem_size.nk());  // <- Create matrix B with dimensions N x K
+      problem_size.kn());  // <- Create matrix B with dimensions K x N
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
       problem_size.mn());  // <- Create matrix C with dimensions M x N
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
index 7faaa98a..1f83a61a 100644
--- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu
+++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
@@ -132,7 +132,7 @@ int run() {
   cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
       problem_size.mk());  // <- Create matrix A with dimensions M x K
   cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
-      problem_size.nk());  // <- Create matrix B with dimensions N x K
+      problem_size.kn());  // <- Create matrix B with dimensions K x N
 
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c_bias(
       {problem_size.m(), 1});  // <- Create matrix C with dimensions M x 1
@@ -234,7 +234,6 @@ int run() {
     tensor_a.device_ref(),
     tensor_b.device_ref(),
     0,
-    tensor_c_bias.device_ref(),
     tensor_ref_d.device_ref());
 
   // Wait for kernels to finish
diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h
index ef65f20b..a862e65d 100644
--- a/include/cutlass/arch/mma_sm75.h
+++ b/include/cutlass/arch/mma_sm75.h
@@ -823,7 +823,7 @@ struct Mma<
   int const *C = reinterpret_cast<int const *>(&c);
   int *D = reinterpret_cast<int *>(&d);
 
-  asm volatile("_mma.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+  asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
       : "=r"(D[0]), "=r"(D[1])
       : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));