Merge pull request #274 from mani-ananth/master

Some pending Bug fixes
2021-06-02 13:17:39 -04:00 · 2021-06-02 13:17:39 -04:00 · 6a1064093f
commit 6a1064093f
parent ad9486684f c5f1ef4dff
3 changed files with 18 additions and 15 deletions
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -23,6 +23,7 @@ Scott Yokim
 Markus Hohnerbach  
 Aditya Atluri  
 David Tanner  
+Manikandan Ananth  
  
 ## CONTRIBUTORS
 Timothy Costa  
--- a/examples/00_basic_gemm/basic_gemm.cu
+++ b/examples/00_basic_gemm/basic_gemm.cu
@ -148,7 +148,6 @@ cudaError_t CutlassSgemmNN(
 /// Kernel to initialize a matrix with small integers.
 __global__ void InitializeMatrix_kernel(
  float *matrix,
-  int ldm,
  int rows,
  int columns,
  int seed = 0) {
@ -157,7 +156,7 @@ __global__ void InitializeMatrix_kernel(
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < rows && j < columns) {
-    int offset = i + j * ldm;
+    int offset = i + j * rows;

    // Generate arbitrary elements.
    int const k = 16807;
@ -169,7 +168,7 @@ __global__ void InitializeMatrix_kernel(
 }

 /// Simple function to initialize a matrix to arbitrary small integers.
-cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
+cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) {

  dim3 block(16, 16);
  dim3 grid(
@ -177,7 +176,7 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int
    (columns + block.y - 1) / block.y
  );

-  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
+  InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed);

  return cudaGetLastError();
 }
@ -185,10 +184,10 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int
 ///////////////////////////////////////////////////////////////////////////////////////////////////

 /// Allocates device memory for a matrix then fills with arbitrary small integers.
-cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
+cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) {
  cudaError_t result;

-  size_t sizeof_matrix = sizeof(float) * ldm * columns;
+  size_t sizeof_matrix = sizeof(float) * columns;

  // Allocate device memory.
  result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
@ -209,7 +208,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s
  }

  // Initialize matrix elements to arbitrary small integers.
-  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
+  result = InitializeMatrix(*matrix, rows, columns, seed);

  if (result != cudaSuccess) {
    std::cerr << "Failed to initialize matrix: "
@ -304,20 +303,20 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
  // Allocate matrices in GPU device memory with arbitrary seeds.
  //

-  result = AllocateMatrix(&A, lda, M, K, 0);
+  result = AllocateMatrix(&A, M, K, 0);

  if (result !=  cudaSuccess) {
    return result;
  }

-  result = AllocateMatrix(&B, ldb, K, N, 17);
+  result = AllocateMatrix(&B, K, N, 17);

  if (result !=  cudaSuccess) {
    cudaFree(A);
    return result;
  }

-  result = AllocateMatrix(&C_cutlass, ldc, M, N, 101);
+  result = AllocateMatrix(&C_cutlass, M, N, 101);

  if (result != cudaSuccess) {
    cudaFree(A);
@ -325,7 +324,7 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
    return result;
  }

-  result = AllocateMatrix(&C_reference, ldc, M, N, 101);
+  result = AllocateMatrix(&C_reference, M, N, 101);

  if (result != cudaSuccess) {
    cudaFree(A);
--- a/include/cutlass/epilogue/thread/linear_combination_clamp.h
+++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h
@ -223,11 +223,14 @@ public:
    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum

    /// Clamping constant value
-    ElementCompute const kClamp =
-        ElementCompute((1U << (sizeof_bits<ElementOutput>::value - 1)) - 1);
+    ElementCompute const kClampMax =
+        ElementCompute(platform::numeric_limits<ElementOutput>::max());

-    intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1));
-    intermediate = min_accumulator(intermediate, kClamp);
+    ElementCompute const kClampMin =
+        ElementCompute(platform::numeric_limits<ElementOutput>::lowest());
+
+    intermediate = max_accumulator(intermediate, kClampMin);
+    intermediate = min_accumulator(intermediate, kClampMax);

    // Convert to destination numeric type
    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;