bug fixes

Manikandan Ananth 2021-06-02 10:08:25 -07:00
parent ad9486684f
commit 47ebfccbec
2 changed files with 17 additions and 15 deletions

View File

@@ -148,7 +148,6 @@ cudaError_t CutlassSgemmNN(
 /// Kernel to initialize a matrix with small integers.
 __global__ void InitializeMatrix_kernel(
   float *matrix,
-  int ldm,
   int rows,
   int columns,
   int seed = 0) {
@@ -157,7 +156,7 @@ __global__ void InitializeMatrix_kernel(
   int j = threadIdx.y + blockIdx.y * blockDim.y;
 
   if (i < rows && j < columns) {
-    int offset = i + j * ldm;
+    int offset = i + j * rows;
 
     // Generate arbitrary elements.
     int const k = 16807;
@@ -169,7 +168,7 @@ __global__ void InitializeMatrix_kernel(
   }
 }
 
 /// Simple function to initialize a matrix to arbitrary small integers.
-cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
+cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) {
 
   dim3 block(16, 16);
   dim3 grid(
@@ -177,7 +176,7 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int
     (columns + block.y - 1) / block.y
   );
 
-  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
+  InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed);
 
   return cudaGetLastError();
 }
@@ -185,10 +184,10 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Allocates device memory for a matrix then fills with arbitrary small integers.
-cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
+cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) {
   cudaError_t result;
 
-  size_t sizeof_matrix = sizeof(float) * ldm * columns;
+  size_t sizeof_matrix = sizeof(float) * rows * columns;
 
   // Allocate device memory.
   result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
@@ -209,7 +208,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s
   }
 
   // Initialize matrix elements to arbitrary small integers.
-  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
+  result = InitializeMatrix(*matrix, rows, columns, seed);
 
   if (result != cudaSuccess) {
     std::cerr << "Failed to initialize matrix: "
@@ -304,20 +303,20 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
   // Allocate matrices in GPU device memory with arbitrary seeds.
   //
 
-  result = AllocateMatrix(&A, lda, M, K, 0);
+  result = AllocateMatrix(&A, M, K, 0);
 
   if (result != cudaSuccess) {
     return result;
   }
 
-  result = AllocateMatrix(&B, ldb, K, N, 17);
+  result = AllocateMatrix(&B, K, N, 17);
 
   if (result != cudaSuccess) {
     cudaFree(A);
     return result;
   }
 
-  result = AllocateMatrix(&C_cutlass, ldc, M, N, 101);
+  result = AllocateMatrix(&C_cutlass, M, N, 101);
 
   if (result != cudaSuccess) {
     cudaFree(A);
@@ -325,7 +324,7 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
     return result;
   }
 
-  result = AllocateMatrix(&C_reference, ldc, M, N, 101);
+  result = AllocateMatrix(&C_reference, M, N, 101);
 
   if (result != cudaSuccess) {
     cudaFree(A);

View File

@@ -223,11 +223,14 @@ public:
     intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
 
     /// Clamping constant value
-    ElementCompute const kClamp =
-        ElementCompute((1U << (sizeof_bits<ElementOutput>::value - 1)) - 1);
+    ElementCompute const kClampMax =
+        ElementCompute(platform::numeric_limits<ElementOutput>::max());
 
-    intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1));
-    intermediate = min_accumulator(intermediate, kClamp);
+    ElementCompute const kClampMin =
+        ElementCompute(platform::numeric_limits<ElementOutput>::lowest());
+
+    intermediate = max_accumulator(intermediate, kClampMin);
+    intermediate = min_accumulator(intermediate, kClampMax);
 
     // Convert to destination numeric type
     NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
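Note on the change above: the epilogue now clamps the scaled accumulator to the full representable range of the output type via platform::numeric_limits, instead of deriving the bound from sizeof_bits. A minimal scalar sketch of the same pattern follows, using std::numeric_limits and int8_t as a stand-in for ElementOutput; the names here are illustrative, not CUTLASS APIs.

// Clamp a float accumulator to the representable range of the output
// integer type before conversion, mirroring kClampMin / kClampMax above.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

template <typename ElementOutput>
ElementOutput ClampAndConvert(float intermediate) {
  float const kClampMax = static_cast<float>(std::numeric_limits<ElementOutput>::max());
  float const kClampMin = static_cast<float>(std::numeric_limits<ElementOutput>::lowest());

  intermediate = std::max(intermediate, kClampMin);
  intermediate = std::min(intermediate, kClampMax);

  return static_cast<ElementOutput>(intermediate);
}

int main() {
  // 300.0f exceeds int8_t's maximum of 127, so it clamps to 127;
  // -300.0f clamps to the minimum of -128.
  std::printf("%d %d\n",
              static_cast<int>(ClampAndConvert<std::int8_t>(300.0f)),
              static_cast<int>(ClampAndConvert<std::int8_t>(-300.0f)));
  return 0;
}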