diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
index 677f4787..b32b20ab 100644
--- a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
+++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
@@ -94,7 +94,7 @@ using SmArch = cutlass::arch::Sm80;
 using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 16>;  // Threadblock tile shape
 
 // This code section describes tile size a warp will compute
-using WarpShape = cutlass::gemm::GemmShape<64, 32, 16>;           // Warp tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 16>;           // Warp tile shape
 
 // This code section describes the size of MMA op
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;       // TensorCore instruction shape
@@ -110,7 +110,8 @@ using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
     ElementOutput,                                      // Data type of output matrix.
     1,                                                  // The number of elements per memory
                                                         // access has. It has to be 1 for
-                                                        // affine2.
+                                                        // affine2.
+    ElementAccumulator,
     ElementComputeEpilogue>;
 
 using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmUniversal<
@@ -226,7 +227,7 @@ int run() {
     tensor_b.device_ref().data(),              // <- reference to matrix B on device
     tensor_c.device_ref().data(),              // <- reference to matrix C on device
     tensor_d.device_ref().data(),              // <- reference to matrix D on device
-    tensor_a.layout().capacity(problem_size.mn()),
+    tensor_a.layout().capacity(problem_size.mk()),
     tensor_b.layout().capacity(problem_size.kn()),
     tensor_c.layout().capacity(problem_size.mn()),
     tensor_d.layout().capacity(problem_size.mn()),
@@ -302,7 +303,7 @@ int run() {
 
   CUTLASS_CHECK(status);
 
-  return 0;
+  return (pass ? 0 : -1);
 }
 
 int main(int argc, char const **args) {
diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h
index e80deeb1..02343c84 100644
--- a/include/cutlass/coord.h
+++ b/include/cutlass/coord.h
@@ -94,14 +94,6 @@ public:
     }
   }
 
-  /// Copy constructor
-  CUTLASS_HOST_DEVICE
-  Coord(Coord const &coord) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] = coord[i];
-    }
-  }
-
   /// Returns a slice of the Coord which may be larger or smaller in rank
   /// than this.
   template <int Slice>
diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h
index eeeeb05c..d06822a2 100644
--- a/include/cutlass/gemm/kernel/gemm_universal.h
+++ b/include/cutlass/gemm/kernel/gemm_universal.h
@@ -162,6 +162,10 @@ public:
       ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
       batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
       stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d) {
+      lda = 0;
+      ldb = 0;
+      ldc = 0;
+      ldd = 0;
       CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
     }
 
diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h
index 1e6402b3..a7792d3f 100644
--- a/include/cutlass/tensor_ref.h
+++ b/include/cutlass/tensor_ref.h
@@ -219,9 +219,12 @@ class TensorRef {
   }
 
   /// Converting constructor from TensorRef to non-constant data.
+  template<typename _Magic = int>
   CUTLASS_HOST_DEVICE
   TensorRef(
-    NonConstTensorRef const &ref              ///< TensorRef to non-const data
+    NonConstTensorRef const &ref,             ///< TensorRef to non-const data
+    ///SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
+    _Magic magic = (typename platform::enable_if< ! platform::is_same<NonConstTensorRef, TensorRef<Element_, Layout_> >::value, _Magic>::type)0
   ):
     ptr_(ref.data()), layout_(ref.layout()) { }
 
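Note on the _Magic default argument in the tensor_ref.h hunk: because the converting constructor (TensorRef over non-const data to TensorRef over const data) becomes a template, it can never be treated as a copy constructor, so the implicitly generated, trivial copy constructor is preserved; the enable_if expression only has a type member when a genuine const conversion is involved. Below is a minimal standalone sketch of that pattern under assumptions, not CUTLASS code: Ref, NonConstRef, data(), and the std:: traits are illustrative stand-ins for TensorRef and cutlass::platform.

// Sketch of the enable_if-guarded converting constructor (hypothetical Ref type).
#include <cassert>
#include <type_traits>

template <typename Element>
class Ref {
public:
  // Ref to the same data with any const qualifier on Element removed.
  using NonConstRef = Ref<typename std::remove_const<Element>::type>;

  Ref() : ptr_(nullptr) {}
  explicit Ref(Element *ptr) : ptr_(ptr) {}

  // Converting constructor from a Ref to non-const data. Being a template, it
  // is never a copy constructor, so the implicit (trivial) copy constructor
  // survives. The default argument's type only exists when Element is
  // const-qualified, i.e. when a genuine conversion is actually needed.
  template <typename Magic = int>
  Ref(NonConstRef const &ref,
      Magic = (typename std::enable_if<
                   !std::is_same<NonConstRef, Ref<Element> >::value,
                   Magic>::type)0)
      : ptr_(ref.data()) {}

  Element *data() const { return ptr_; }

private:
  Element *ptr_;
};

int main() {
  int x = 42;
  Ref<int> mutable_ref(&x);
  Ref<int const> const_ref(mutable_ref);  // uses the converting constructor
  Ref<int> copied(mutable_ref);           // uses the implicit trivial copy constructor
  assert(const_ref.data() == copied.data());
  static_assert(std::is_trivially_copyable<Ref<int> >::value,
                "copy constructor stays trivial");
  return 0;
}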