diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bf6d239..9468cc1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # NVIDIA CUTLASS Changelog +## [1.3.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.1) (2019-04-09) + * Corrected NVRTC unit tests. + ## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20) * Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1. diff --git a/README.md b/README.md index 231eafba..4c539e92 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # CUTLASS 1.3 -_CUTLASS 1.3.0 - March 2019_ +_CUTLASS 1.3.1 - April 2019_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -28,6 +28,10 @@ CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the acco We describe the structure of an efficient GEMM in our talk at the [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf). +# What's New in CUTLASS 1.3.1 +_April 2019_ +* CUTLASS 1.3.1 corrected NVRTC unit tests. + # What's New in CUTLASS 1.3 _March 2019_ * CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1. diff --git a/cutlass/cutlass.h b/cutlass/cutlass.h index 26de6c02..783ea3b6 100644 --- a/cutlass/cutlass.h +++ b/cutlass/cutlass.h @@ -34,7 +34,7 @@ #define CUTLASS_MAJOR 1 #define CUTLASS_MINOR 3 -#define CUTLASS_PATCH 0 +#define CUTLASS_PATCH 1 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH) #ifdef __NVCC__ @@ -58,8 +58,13 @@ // CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. 
#if defined(__CUDA_ARCH__) +#ifdef __NVCC__ #define CUTLASS_PRAGMA_UNROLL #pragma unroll #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1 +#elif defined(__CUDACC_RTC__) + #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") + #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") +#endif #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL @@ -80,6 +85,7 @@ template struct DebugType {}; template +CUTLASS_HOST_DEVICE void DebugTypeFunc(T const& t) { T::t; } diff --git a/cutlass/gemm/gemm.h b/cutlass/gemm/gemm.h index 0d919199..70cb2ccc 100644 --- a/cutlass/gemm/gemm.h +++ b/cutlass/gemm/gemm.h @@ -33,7 +33,6 @@ #include "cutlass/coord.h" #include "cutlass/util/platform.h" -#include namespace cutlass { namespace gemm { @@ -84,6 +83,7 @@ void gemm_kernel_nolb(typename Gemm_::Params params) { //////////////////////////////////////////////////////////////////////////////////////////////////// +#if !defined(__CUDACC_RTC__) /// Partial specialization for launching the GEMM kernel with or without launch bounds template struct Launch { @@ -152,7 +152,51 @@ struct Launch { smem_size, stream >>>(params); } + + // Use device API to launch kernel + Launch(cudaError_t &result, CUfunction kernel, + typename Gemm::Params params, dim3 grid, dim3 block, CUstream stream = CU_STREAM_LEGACY) { + void* params_[] = {const_cast(reinterpret_cast(¶ms))}; + + int smem_size = int(sizeof(typename Gemm::SharedStorage)); + if (smem_size >= (48 << 10)) { + + result = cudaFuncSetAttribute( + kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size + ); + + if (result != cudaSuccess) { + return; + } + + result = cudaFuncSetAttribute( + kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, + 100); + + if (result != cudaSuccess) { + return; + } + } + + CUresult launch_result = cuLaunchKernel( + kernel, + grid.x, grid.y, grid.z, + block.x, block.y, block.z, + smem_size, stream, params_, 0); + + if (launch_result != CUDA_SUCCESS) { + result = cudaErrorLaunchFailure; + return; + } + + result = 
cudaSuccess; + return; + } }; +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -188,20 +232,13 @@ struct Gemm { static __host__ cudaError_t launch(CUfunction kernel, Params const& params, CUstream stream = CU_STREAM_LEGACY) { + cudaError_t result; // Launch the kernel. - void* params_[] = {const_cast(reinterpret_cast(¶ms))}; + Launch( + result, kernel, params, params.grid, params.block, stream); - CUresult result = cuLaunchKernel( - kernel, - params.grid.x, params.grid.y, params.grid.z, - params.block.x, params.block.y, params.block.z, - 0, stream, params_, 0); - - if (result != CUDA_SUCCESS) { - return cudaErrorLaunchFailure; - } - return cudaSuccess; + return result; } #endif diff --git a/cutlass/gemm/scalar_or_pointer.h b/cutlass/gemm/scalar_or_pointer.h index 9e292951..315d1a4c 100644 --- a/cutlass/gemm/scalar_or_pointer.h +++ b/cutlass/gemm/scalar_or_pointer.h @@ -1,3 +1,4 @@ + /*************************************************************************************************** * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
* diff --git a/cutlass/layout/thread/transform.h b/cutlass/layout/thread/transform.h index 3abb22e1..8b9445e0 100644 --- a/cutlass/layout/thread/transform.h +++ b/cutlass/layout/thread/transform.h @@ -77,6 +77,7 @@ struct Copy { } }; +#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) template struct Copy { CUTLASS_DEVICE void copy(cutlass::TensorView dst, @@ -140,6 +141,7 @@ struct Copy struct Transform { typedef Fragment::kCount> DstFragment; @@ -266,6 +269,7 @@ struct Transform { Transformer.copy(dstView, srcView); } }; +#endif template struct Transform { diff --git a/cutlass/tensor_view.h b/cutlass/tensor_view.h index d770a193..28b12e5a 100644 --- a/cutlass/tensor_view.h +++ b/cutlass/tensor_view.h @@ -36,8 +36,6 @@ #pragma once -#include - #include "cutlass/cutlass.h" #include "cutlass/tensor_ref.h" diff --git a/cutlass/tile_iterator.h b/cutlass/tile_iterator.h index 923d7e10..b8a88252 100644 --- a/cutlass/tile_iterator.h +++ b/cutlass/tile_iterator.h @@ -34,7 +34,6 @@ #include "cutlass/load_store.h" #include "cutlass/predicate_vector.h" #include "cutlass/vector.h" -#include namespace cutlass { diff --git a/cutlass/util/performance_tuning.h b/cutlass/util/performance_tuning.h deleted file mode 100644 index fd117740..00000000 --- a/cutlass/util/performance_tuning.h +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are not permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#pragma once -#ifndef CUTLASS_PERFORMANCE_TUNING_H -#define CUTLASS_PERFORMANCE_TUNING_H - -// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. - -#if defined(__CUDA_ARCH__) -#if defined(_MSC_VER) -#define CUTLASS_PRAGMA_UNROLL __pragma("unroll") -#define CUTLASS_PRAGMA_NO_UNROLL __pragma("unroll 1") -#else -#define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") -#define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") -#endif -#else -#define CUTLASS_PRAGMA_UNROLL -#define CUTLASS_PRAGMA_NO_UNROLL -#endif - -#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL -#endif // CUTLASS_PERFORMANCE_TUNING_H diff --git a/cutlass/vector.h b/cutlass/vector.h index 9b8a30ea..e59340b8 100644 --- a/cutlass/vector.h +++ b/cutlass/vector.h @@ -88,6 +88,8 @@ union Vector { //////////////////////////////////////////////////////////////////////////////////////////////////// +#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) + template <> union Vector { /// The scalar type. 
@@ -118,7 +120,6 @@ union Vector { } }; -#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) template union Vector { diff --git a/tools/nvrtc/CMakeLists.txt b/tools/nvrtc/CMakeLists.txt index 2eeb90d0..bb8eeee9 100644 --- a/tools/nvrtc/CMakeLists.txt +++ b/tools/nvrtc/CMakeLists.txt @@ -54,7 +54,7 @@ if (CUTLASS_NVRTC_ENABLE) string(APPEND NVRTC_INCLUDES_STRINGS "char const *kCutlassHeaders[] = {\n") string(APPEND NVRTC_INCLUDES_NAMES "char const *kCutlassHeaderNames[] = {\n") - add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE}") + add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE};${CUTLASS_ARCH};${CUTLASS_LAYOUT_THREAD}") message("${CMAKE_CURRENT_SOURCE_DIR}/") add_nvrtc_headers("${CMAKE_CURRENT_SOURCE_DIR}/stdlib" "assert.h;stdint.h") if(CUTLASS_NVRTC_HAS_CUDA_FP16) diff --git a/tools/test/unit/gemm/gemm_nvrtc.cu b/tools/test/unit/gemm/gemm_nvrtc.cu index 89dfe1a6..dcfb1658 100644 --- a/tools/test/unit/gemm/gemm_nvrtc.cu +++ b/tools/test/unit/gemm/gemm_nvrtc.cu @@ -43,6 +43,8 @@ TEST(Dgemm_nvrtc_64x32x8, dgemm_nvrtc_64x32x8_nt) { //////////////////////////////////////////////////////////////////////////////////////////////////// +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 610)) + TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) { typedef cutlass::gemm::IgemmTraits(gemm_traits, 256, 256, 64); } +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// TEST(Sgemm_nvrtc_128x128x8, sgemm_nvrtc_128x112x16_alpha2_beta1_nt) { diff --git a/tools/test/unit/gemm/gemm_nvrtc.h b/tools/test/unit/gemm/gemm_nvrtc.h index 493eafb4..10123d68 100644 --- a/tools/test/unit/gemm/gemm_nvrtc.h +++ b/tools/test/unit/gemm/gemm_nvrtc.h @@ -30,6 +30,7 @@ #include #include "tools/nvrtc/cutlass/nvrtc/environment.h" #include +#include static inline bool check_nvrtc_error(nvrtcResult error) { if (error != 
NVRTC_SUCCESS) { @@ -67,31 +68,36 @@ static __host__ void run_gemm_nvrtc( testbed(m, n, k, - cutlass::convert(GemmTraits_::kLayoutA), - cutlass::convert(GemmTraits_::kLayoutB), + test::convert(GemmTraits_::kLayoutA), + test::convert(GemmTraits_::kLayoutB), alpha, beta); + int currentDevice; + cudaGetDevice(&currentDevice); + + // generate the architecture string for the nvrtc compiler + cudaDeviceProp deviceProperties; + cudaGetDeviceProperties(&deviceProperties, currentDevice); + std::stringstream arch; + arch << "-arch=compute_" << deviceProperties.major << deviceProperties.minor; + // Instantiate gemm_kernel nvrtcResult result_nvrtc; nvrtcProgram program; static char const *src = - "#include "cutlass/gemm/gemm.h"\n" - "#include "cutlass/gemm/sgemm_traits.h"\n" - "#include "cutlass/gemm/dgemm_traits.h"\n" - "#include "cutlass/gemm/igemm_traits.h"\n" + "#include \"cutlass/gemm/gemm.h\"\n" + "#include \"cutlass/gemm/sgemm_traits.h\"\n" + "#include \"cutlass/gemm/dgemm_traits.h\"\n" + "#include \"cutlass/gemm/igemm_traits.h\"\n" #if defined(CUTLASS_NVRTC_HAS_FP16) - "#include "cutlass/gemm/hgemm_traits.h"\n" - "#include "cutlass/gemm/wmma_gemm_traits.h"\n" + "#include \"cutlass/gemm/hgemm_traits.h\"\n" + "#include \"cutlass/gemm/wmma_gemm_traits.h\"\n" #endif ; std::string type_name; -#if 0 - nvrtcGetTypeName(&type_name); -#else - type_name = gemm_traits; -#endif + nvrtcGetTypeName(&type_name); result_nvrtc = nvrtcCreateProgram(&program, src, @@ -102,10 +108,22 @@ check_nvrtc_error(result_nvrtc); std::string gemm_kernel_instantiation = - "cutlass::gemm::gemm_kernel >"; + "cutlass::gemm::gemm_kernel::KernelClass >"; nvrtcAddNameExpression(program, gemm_kernel_instantiation.c_str()); - result_nvrtc = nvrtcCompileProgram(program, 0, NULL); + // generate option list to generate kernel for the underlying GPU + std::vector options; + std::vector c_options; + + options.push_back(arch.str()); + + // convert option list into a c-string list 
for the nvrtc interface + for (std::vector::const_iterator i = options.begin(); i != options.end(); ++i) { + c_options.push_back(i->c_str()); + } + + // compile + result_nvrtc = nvrtcCompileProgram(program, int(c_options.size()), c_options.data()); if (result_nvrtc != NVRTC_SUCCESS) { size_t logSize; nvrtcGetProgramLogSize(program, &logSize); @@ -118,11 +136,13 @@ } // The lowered name is the name of the template instantiation in the generated PTX code. - char const *gemm_kernel_lowered_name; - nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &gemm_kernel_lowered_name); + char const *temp_gemm_kernel_lowered_name; + nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &temp_gemm_kernel_lowered_name); if (!check_nvrtc_error(result_nvrtc)) { ASSERT_TRUE(false); } + // the pointer we got from nvrtcGetLoweredName is valid only as long as the program is valid. create a copy. + std::string gemm_kernel_lowered_name(temp_gemm_kernel_lowered_name); // Query the size of the generated PTX so that we can allocate storage and retrieve it afterwards size_t ptx_size; @@ -134,22 +154,32 @@ std::vector ptx(ptx_size); result_nvrtc = nvrtcGetPTX(program, ptx.data()); if (!check_nvrtc_error(result_nvrtc)) { + std::cerr << "failed to get ptx" << std::endl; ASSERT_TRUE(false); } // we do not need the nvrtc program anymore nvrtcDestroyProgram(&program); + // Now load the module CUmodule module; CUresult result_cuda; + result_cuda = cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0); if (result_cuda != CUDA_SUCCESS) { + const char *msg; + cuGetErrorName(result_cuda, &msg); + std::cerr << "\ncuModuleLoadDataEx error: failed with error " << msg << std::endl; ASSERT_TRUE(false); } + // and retrieve the function CUfunction kernel; - result_cuda = cuModuleGetFunction(&kernel, module, gemm_kernel_lowered_name.c_str()); 
if (result_cuda != CUDA_SUCCESS) { + const char *msg; + cuGetErrorName(result_cuda, &msg); + std::cerr << "\ncuModuleGetFunction error: failed with error " << msg << std::endl; ASSERT_TRUE(false); } @@ -173,16 +203,23 @@ static __host__ void run_gemm_nvrtc( testbed.ptr_computed(), testbed.ldc()); - // Gemm::launch(params); Gemm::launch(kernel, params); cudaError_t result = cudaDeviceSynchronize(); ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result) - << "\n"; + << std::endl; if (testbed.has_cublas_support()) { ASSERT_TRUE(testbed.verify_with_cublas()); } else { ASSERT_TRUE(testbed.verify_with_host()); } + + result_cuda = cuModuleUnload(module); + if (result_cuda != CUDA_SUCCESS) { + const char *msg; + cuGetErrorName(result_cuda, &msg); + std::cerr << "\ncuModuleUnload error: failed with error " << msg << std::endl; + ASSERT_TRUE(false); + } }