From 47ebfccbec25ed8dd66ef3335bbb83ea6d3073ed Mon Sep 17 00:00:00 2001 From: Manikandan Ananth Date: Wed, 2 Jun 2021 10:08:25 -0700 Subject: [PATCH 1/2] bug fixes --- examples/00_basic_gemm/basic_gemm.cu | 21 +++++++++---------- .../thread/linear_combination_clamp.h | 11 ++++++---- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index 1dbeef75..70d632ea 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -148,7 +148,6 @@ cudaError_t CutlassSgemmNN( /// Kernel to initialize a matrix with small integers. __global__ void InitializeMatrix_kernel( float *matrix, - int ldm, int rows, int columns, int seed = 0) { @@ -157,7 +156,7 @@ __global__ void InitializeMatrix_kernel( int j = threadIdx.y + blockIdx.y * blockDim.y; if (i < rows && j < columns) { - int offset = i + j * ldm; + int offset = i + j * rows; // Generate arbitrary elements. int const k = 16807; @@ -169,7 +168,7 @@ __global__ void InitializeMatrix_kernel( } /// Simple function to initialize a matrix to arbitrary small integers. -cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) { +cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) { dim3 block(16, 16); dim3 grid( @@ -177,7 +176,7 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int (columns + block.y - 1) / block.y ); - InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed); + InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed); return cudaGetLastError(); } @@ -185,10 +184,10 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int /////////////////////////////////////////////////////////////////////////////////////////////////// /// Allocates device memory for a matrix then fills with arbitrary small integers. 
-cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) { +cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) { cudaError_t result; - size_t sizeof_matrix = sizeof(float) * ldm * columns; + size_t sizeof_matrix = sizeof(float) * rows * columns; // Allocate device memory. result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix); @@ -209,7 +208,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s } // Initialize matrix elements to arbitrary small integers. - result = InitializeMatrix(*matrix, ldm, rows, columns, seed); + result = InitializeMatrix(*matrix, rows, columns, seed); if (result != cudaSuccess) { std::cerr << "Failed to initialize matrix: " @@ -304,20 +303,20 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) { // Allocate matrices in GPU device memory with arbitrary seeds. // - result = AllocateMatrix(&A, lda, M, K, 0); + result = AllocateMatrix(&A, M, K, 0); if (result != cudaSuccess) { return result; } - result = AllocateMatrix(&B, ldb, K, N, 17); + result = AllocateMatrix(&B, K, N, 17); if (result != cudaSuccess) { cudaFree(A); return result; } - result = AllocateMatrix(&C_cutlass, ldc, M, N, 101); + result = AllocateMatrix(&C_cutlass, M, N, 101); if (result != cudaSuccess) { cudaFree(A); @@ -325,7 +324,7 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) { - result = AllocateMatrix(&C_reference, ldc, M, N, 101); + result = AllocateMatrix(&C_reference, M, N, 101); if (result != cudaSuccess) { cudaFree(A); diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h index 7d47a5c5..e1bf10bb 100644 --- a/include/cutlass/epilogue/thread/linear_combination_clamp.h +++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h @@ -223,11 +223,14 @@ public: intermediate = mul_accumulator(alpha_, converted_accumulator); // D = 
alpha * Accum /// Clamping constant value - ElementCompute const kClamp = - ElementCompute((1U << (sizeof_bits<ElementOutput>::value - 1)) - 1); + ElementCompute const kClampMax = + ElementCompute(platform::numeric_limits<ElementOutput>::max()); - intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1)); - intermediate = min_accumulator(intermediate, kClamp); + ElementCompute const kClampMin = + ElementCompute(platform::numeric_limits<ElementOutput>::lowest()); + + intermediate = max_accumulator(intermediate, kClampMin); + intermediate = min_accumulator(intermediate, kClampMax); // Convert to destination numeric type NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter; From c5f1ef4dff6d90e1ef1c5817c17b758fba1e0818 Mon Sep 17 00:00:00 2001 From: Manikandan Ananth Date: Wed, 2 Jun 2021 10:11:42 -0700 Subject: [PATCH 2/2] update contributors --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a4e0a2a4..5ce83dfb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -23,6 +23,7 @@ Scott Yokim Markus Hohnerbach Aditya Atluri David Tanner +Manikandan Ananth ## CONTRIBUTORS Timothy Costa