From 47ebfccbec25ed8dd66ef3335bbb83ea6d3073ed Mon Sep 17 00:00:00 2001 From: Manikandan Ananth Date: Wed, 2 Jun 2021 10:08:25 -0700 Subject: [PATCH 1/2] bug fixes --- examples/00_basic_gemm/basic_gemm.cu | 21 +++++++++---------- .../thread/linear_combination_clamp.h | 11 ++++++---- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index 1dbeef75..70d632ea 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -148,7 +148,6 @@ cudaError_t CutlassSgemmNN( /// Kernel to initialize a matrix with small integers. __global__ void InitializeMatrix_kernel( float *matrix, - int ldm, int rows, int columns, int seed = 0) { @@ -157,7 +156,7 @@ __global__ void InitializeMatrix_kernel( int j = threadIdx.y + blockIdx.y * blockDim.y; if (i < rows && j < columns) { - int offset = i + j * ldm; + int offset = i + j * rows; // Generate arbitrary elements. int const k = 16807; @@ -169,7 +168,7 @@ __global__ void InitializeMatrix_kernel( } /// Simple function to initialize a matrix to arbitrary small integers. -cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) { +cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) { dim3 block(16, 16); dim3 grid( @@ -177,7 +176,7 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int (columns + block.y - 1) / block.y ); - InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed); + InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed); return cudaGetLastError(); } @@ -185,10 +184,10 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int /////////////////////////////////////////////////////////////////////////////////////////////////// /// Allocates device memory for a matrix then fills with arbitrary small integers. 
-cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) { +cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) { cudaError_t result; - size_t sizeof_matrix = sizeof(float) * ldm * columns; + size_t sizeof_matrix = sizeof(float) * rows * columns; // Allocate device memory. result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix); @@ -209,7 +208,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s } // Initialize matrix elements to arbitrary small integers. - result = InitializeMatrix(*matrix, ldm, rows, columns, seed); + result = InitializeMatrix(*matrix, rows, columns, seed); if (result != cudaSuccess) { std::cerr << "Failed to initialize matrix: " @@ -304,20 +303,20 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) { // Allocate matrices in GPU device memory with arbitrary seeds. // - result = AllocateMatrix(&A, lda, M, K, 0); + result = AllocateMatrix(&A, M, K, 0); if (result != cudaSuccess) { return result; } - result = AllocateMatrix(&B, ldb, K, N, 17); + result = AllocateMatrix(&B, K, N, 17); if (result != cudaSuccess) { cudaFree(A); return result; } - result = AllocateMatrix(&C_cutlass, ldc, M, N, 101); + result = AllocateMatrix(&C_cutlass, M, N, 101); if (result != cudaSuccess) { cudaFree(A); @@ -325,7 +324,7 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) { - result = AllocateMatrix(&C_reference, ldc, M, N, 101); + result = AllocateMatrix(&C_reference, M, N, 101); if (result != cudaSuccess) { cudaFree(A); diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h index 7d47a5c5..e1bf10bb 100644 --- a/include/cutlass/epilogue/thread/linear_combination_clamp.h +++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h @@ -223,11 +223,14 @@ public: intermediate = mul_accumulator(alpha_, converted_accumulator); // D = 
alpha * Accum /// Clamping constant value - ElementCompute const kClamp = - ElementCompute((1U << (sizeof_bits<ElementOutput>::value - 1)) - 1); + ElementCompute const kClampMax = + ElementCompute(platform::numeric_limits<ElementOutput>::max()); - intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1)); - intermediate = min_accumulator(intermediate, kClamp); + ElementCompute const kClampMin = + ElementCompute(platform::numeric_limits<ElementOutput>::lowest()); + + intermediate = max_accumulator(intermediate, kClampMin); + intermediate = min_accumulator(intermediate, kClampMax); // Convert to destination numeric type NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter; From c5f1ef4dff6d90e1ef1c5817c17b758fba1e0818 Mon Sep 17 00:00:00 2001 From: Manikandan Ananth Date: Wed, 2 Jun 2021 10:11:42 -0700 Subject: [PATCH 2/2] update contributors --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a4e0a2a4..5ce83dfb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -23,6 +23,7 @@ Scott Yokim Markus Hohnerbach Aditya Atluri David Tanner +Manikandan Ananth ## CONTRIBUTORS Timothy Costa