 66d9cddc83
			
		
	
	
		66d9cddc83
		
			
		
	
	
	
	
		
			
			* New updates. * Minor profiler updates Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
		
			
				
	
	
		
			523 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			523 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| /***************************************************************************************************
 | |
|  * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 | |
|  * SPDX-License-Identifier: BSD-3-Clause
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions are met:
 | |
|  *
 | |
|  * 1. Redistributions of source code must retain the above copyright notice, this
 | |
|  * list of conditions and the following disclaimer.
 | |
|  *
 | |
|  * 2. Redistributions in binary form must reproduce the above copyright notice,
 | |
|  * this list of conditions and the following disclaimer in the documentation
 | |
|  * and/or other materials provided with the distribution.
 | |
|  *
 | |
|  * 3. Neither the name of the copyright holder nor the names of its
 | |
|  * contributors may be used to endorse or promote products derived from
 | |
|  * this software without specific prior written permission.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
|  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 | |
|  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | |
|  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | |
|  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | |
|  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | |
|  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  *
 | |
|  **************************************************************************************************/
 | |
| 
 | |
| /*
 | |
|   This example demonstrates how to call a CUTLASS SYRK kernel and provides a naive reference
 | |
|   matrix multiply kernel to verify its correctness.
 | |
| 
 | |
|   The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes
 | |
|   the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes
 | |
|   all matrices have column-major layout.
 | |
| 
 | |
|   The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
 | |
|   See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
 | |
|   in CUTLASS.
 | |
| 
 | |
|   https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/
 | |
| 
 | |
|   Aside from defining and launching the SSYRK kernel, this example does not use any other components
 | |
|   or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
 | |
|   prevalent in the CUTLASS unit tests.
 | |
| 
 | |
| */
 | |
| 
 | |
| // Standard Library includes
 | |
| #include <iostream>
 | |
| #include <sstream>
 | |
| #include <vector>
 | |
| 
 | |
| // Helper methods to check for errors
 | |
| #include "helper.h"
 | |
| 
 | |
| //
 | |
| // CUTLASS includes needed for double-precision SYRK kernel
 | |
| //
 | |
| 
 | |
| // Defines cutlass::gemm::device::Syrk, the generic Syrk computation template class.
 | |
| #include "cutlass/gemm/device/rank_k.h"
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| //
 | |
| // This function defines a CUTLASS SYRK kernel instantiation, constructs its parameters object,
 | |
| // and launches it on the CUDA device.
 | |
| //
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Define a CUTLASS SYRK template and launch a SYRK kernel.
 | |
| cudaError_t CutlassSsyrkNN(
 | |
|   int N,
 | |
|   int K,
 | |
|   double alpha,
 | |
|   double const *A,
 | |
|   int lda,
 | |
|   double beta,
 | |
|   double *C,
 | |
|   int ldc) {
 | |
| 
 | |
|   // Define type definition for double-precision CUTLASS SYRK with column-major
 | |
|   // input matrices and 16x32x16 threadblock tile size (chosen by default).
 | |
|   //
 | |
|   // To keep the interface manageable, several helpers are defined for plausible compositions
 | |
|   // including the following example for double-precision SYRK. Typical values are used as
 | |
|   // default template arguments.
 | |
|   //
 | |
|   // To view the full syrk device API interface, see `cutlass/gemm/device/syrk.h`
 | |
| 
 | |
|   using ColumnMajor = cutlass::layout::ColumnMajor;
 | |
| 
 | |
|   using CutlassSyrk = cutlass::gemm::device::RankK<
 | |
|     double,
 | |
|     ColumnMajor,
 | |
|     double,
 | |
|     ColumnMajor,
 | |
|     cutlass::FillMode::kLower,
 | |
|     double,
 | |
|     cutlass::arch::OpClassTensorOp,
 | |
|     cutlass::arch::Sm80,
 | |
|     cutlass::gemm::GemmShape<16, 32, 16>,
 | |
|     cutlass::gemm::GemmShape<16, 16, 16>,
 | |
|     cutlass::gemm::GemmShape<8, 8, 4>,
 | |
|     cutlass::epilogue::thread::LinearCombination<
 | |
|       double,
 | |
|       1,
 | |
|       double,
 | |
|       double
 | |
|     >,
 | |
|     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
 | |
|     5,     // Stages
 | |
|     1,     // AligmentA
 | |
|     false, // SplitKSerail
 | |
|     cutlass::arch::OpMultiplyAdd, 
 | |
|     cutlass::ComplexTransform::kNone, 
 | |
|     cutlass::BlasMode::kSymmetric
 | |
|   >;
 | |
| 
 | |
|   // Define a CUTLASS SYRK type
 | |
|   CutlassSyrk syrk_operator;
 | |
| 
 | |
|   // Construct the CUTLASS SYRK arguments object.
 | |
|   //
 | |
|   // One of CUTLASS's design patterns is to define syrk argument objects that are constructible
 | |
|   // in host code and passed to kernels by value. These may include pointers, strides, scalars,
 | |
|   // and other arguments needed by Syrk and its components.
 | |
|   //
 | |
|   // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible
 | |
|   // arguments to kernels and (2.) minimized initialization overhead on kernel entry.
 | |
|   //
 | |
|   CutlassSyrk::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm,
 | |
|                               {N, N, K}, // Syrk Problem dimensions
 | |
|                               1, // batch_count,
 | |
|                               {alpha, beta}, // Scalars used in the Epilogue
 | |
|                               reinterpret_cast<void const *>(A),
 | |
|                               const_cast<void *>(reinterpret_cast<void *>(C)),
 | |
|                               reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
 | |
|                               (int64_t)N*K, // Batch strides
 | |
|                               (int64_t)N*N,
 | |
|                               (int64_t)N*N,
 | |
|                               lda,
 | |
|                               ldc,
 | |
|                               ldc);
 | |
| 
 | |
|   //
 | |
|   // Launch the CUTLASS SYRK kernel.
 | |
|   //
 | |
|   
 | |
|   cutlass::Status status = syrk_operator(args);
 | |
| 
 | |
|   //
 | |
|   // Return a cudaError_t if the CUTLASS SYRK operator returned an error code.
 | |
|   //
 | |
| 
 | |
|   if (status != cutlass::Status::kSuccess) {
 | |
|     return cudaErrorUnknown;
 | |
|   }
 | |
| 
 | |
|   // Return success, if no errors were encountered.
 | |
|   return cudaSuccess;
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| //
 | |
| // The source code after this point in the file is generic CUDA using the CUDA Runtime API
 | |
| // and simple CUDA kernels to initialize matrices and compute the general matrix product.
 | |
| //
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Kernel to initialize a matrix with small integers.
 | |
| __global__ void InitializeMatrix_kernel(
 | |
|   double *matrix,
 | |
|   int ldm,
 | |
|   int rows,
 | |
|   int columns,
 | |
|   int seed = 0) {
 | |
| 
 | |
|   int i = threadIdx.x + blockIdx.x * blockDim.x;
 | |
|   int j = threadIdx.y + blockIdx.y * blockDim.y;
 | |
| 
 | |
|   if (i < rows && j < columns) {
 | |
|     int offset = i + j * ldm;
 | |
| 
 | |
|     // Generate arbitrary elements.
 | |
|     int const k = 16807;
 | |
|     int const m = 16;
 | |
|     double value = double(((offset + seed) * k % m) - m / 2);
 | |
| 
 | |
|     matrix[offset] = value;
 | |
|   }
 | |
| }
 | |
| 
 | |
| /// Simple function to initialize a matrix to arbitrary small integers.
 | |
| cudaError_t InitializeMatrix(double *matrix, int ldm, int rows, int columns, int seed = 0) {
 | |
| 
 | |
|   dim3 block(16, 16);
 | |
|   dim3 grid(
 | |
|     (rows + block.x - 1) / block.x,
 | |
|     (columns + block.y - 1) / block.y
 | |
|   );
 | |
| 
 | |
|   InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
 | |
| 
 | |
|   return cudaGetLastError();
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Allocates device memory for a matrix then fills with arbitrary small integers.
 | |
| cudaError_t AllocateMatrix(double **matrix, int ldm, int rows, int columns, int seed = 0) {
 | |
|   cudaError_t result;
 | |
| 
 | |
|   size_t sizeof_matrix = sizeof(double) * ldm * columns;
 | |
| 
 | |
|   // Allocate device memory.
 | |
|   result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Failed to allocate matrix: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   // Clear the allocation.
 | |
|   result = cudaMemset(*matrix, 0, sizeof_matrix);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Failed to clear matrix device memory: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   // Initialize matrix elements to arbitrary small integers.
 | |
|   result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Failed to initialize matrix: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Naive reference SYRK computation.
 | |
| __global__ void ReferenceSyrk_kernel(
 | |
|   int N,
 | |
|   int K,
 | |
|   double alpha,
 | |
|   double const *A,
 | |
|   int lda,
 | |
|   double beta,
 | |
|   double *C,
 | |
|   int ldc) {
 | |
| 
 | |
|   int i = threadIdx.x + blockIdx.x * blockDim.x;
 | |
|   int j = threadIdx.y + blockIdx.y * blockDim.y;
 | |
| 
 | |
|   if (i < N && j < N && i >= j ) { // Since C is in Lower Fill Mode
 | |
|     double accumulator = 0;
 | |
| 
 | |
|     for (int k = 0; k < K; ++k) {
 | |
|       accumulator += A[i + k * lda] * A[j + k * lda];
 | |
|     }
 | |
| 
 | |
|     C[i + j * ldc] = alpha * accumulator + beta * C[i + j * ldc];
 | |
|   }
 | |
| }
 | |
| 
 | |
| /// Reference SYRK computation.
 | |
| cudaError_t ReferenceSyrk(
 | |
|   int N,
 | |
|   int K,
 | |
|   double alpha,
 | |
|   double const *A,
 | |
|   int lda,
 | |
|   double beta,
 | |
|   double *C,
 | |
|   int ldc) {
 | |
| 
 | |
|   dim3 block(16, 16);
 | |
|   dim3 grid(
 | |
|     (N + block.x - 1) / block.x,
 | |
|     (N + block.y - 1) / block.y
 | |
|   );
 | |
| 
 | |
|   ReferenceSyrk_kernel<<< grid, block >>>(N, K, alpha, A, lda, beta, C, ldc);
 | |
| 
 | |
|   return cudaGetLastError();
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Allocate several matrices in GPU device memory and call a double-precision
 | |
| /// CUTLASS SYRK kernel.
 | |
| cudaError_t TestCutlassSyrk(int N, int K, double alpha, double beta) {
 | |
|   cudaError_t result;
 | |
| 
 | |
|   //
 | |
|   // Define several matrices to be used as operands to SYRK kernels.
 | |
|   //
 | |
| 
 | |
|   // Compute leading dimensions for each matrix.
 | |
|   int lda = N;
 | |
|   int ldc = N;
 | |
| 
 | |
|   // Compute size in bytes of the C matrix.
 | |
|   size_t sizeof_C = sizeof(double) * ldc * N;
 | |
| 
 | |
|   // Define pointers to matrices in GPU device memory.
 | |
|   double *A;
 | |
|   double *C_cutlass;
 | |
|   double *C_reference;
 | |
| 
 | |
|   //
 | |
|   // Allocate matrices in GPU device memory with arbitrary seeds.
 | |
|   //
 | |
| 
 | |
|   result = AllocateMatrix(&A, lda, N, K, 0);
 | |
| 
 | |
|   if (result !=  cudaSuccess) {
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   result = AllocateMatrix(&C_cutlass, ldc, N, N, 101);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     cudaFree(A);
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   result = AllocateMatrix(&C_reference, ldc, N, N, 101);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     cudaFree(A);
 | |
|     cudaFree(C_cutlass);
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Failed to copy C_cutlass matrix to C_reference: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
| 
 | |
|     cudaFree(C_reference);
 | |
|     cudaFree(C_cutlass);
 | |
|     cudaFree(A);
 | |
| 
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   //
 | |
|   // Launch CUTLASS SYRK.
 | |
|   //
 | |
| 
 | |
|   result = CutlassSsyrkNN(N, K, alpha, A, lda, beta, C_cutlass, ldc);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "CUTLASS SYRK kernel failed: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
| 
 | |
|     cudaFree(C_reference);
 | |
|     cudaFree(C_cutlass);
 | |
|     cudaFree(A);
 | |
| 
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   //
 | |
|   // Verify.
 | |
|   //
 | |
| 
 | |
|   // Launch reference SYRK
 | |
|   result = ReferenceSyrk(N, K, alpha, A, lda, beta, C_reference, ldc);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Reference SYRK kernel failed: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
| 
 | |
|     cudaFree(C_reference);
 | |
|     cudaFree(C_cutlass);
 | |
|     cudaFree(A);
 | |
| 
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   // Copy to host and verify equivalence.
 | |
|   std::vector<double> host_cutlass(ldc * N, 0);
 | |
|   std::vector<double> host_reference(ldc * N, 0);
 | |
| 
 | |
|   result = cudaMemcpy(host_cutlass.data(), C_cutlass, sizeof_C, cudaMemcpyDeviceToHost);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Failed to copy CUTLASS SYRK results: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
| 
 | |
|     cudaFree(C_reference);
 | |
|     cudaFree(C_cutlass);
 | |
|     cudaFree(A);
 | |
| 
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   result = cudaMemcpy(host_reference.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
 | |
| 
 | |
|   if (result != cudaSuccess) {
 | |
|     std::cerr << "Failed to copy Reference SYRK results: "
 | |
|       << cudaGetErrorString(result) << std::endl;
 | |
| 
 | |
|     cudaFree(C_reference);
 | |
|     cudaFree(C_cutlass);
 | |
|     cudaFree(A);
 | |
| 
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   //
 | |
|   // Free device memory allocations.
 | |
|   //
 | |
| 
 | |
|   cudaFree(C_reference);
 | |
|   cudaFree(C_cutlass);
 | |
|   cudaFree(A);
 | |
| 
 | |
|   //
 | |
|   // Test for bit equivalence of results.
 | |
|   //
 | |
| 
 | |
|   if (host_cutlass != host_reference) {
 | |
|     std::cerr << "CUTLASS results incorrect." << std::endl;
 | |
| 
 | |
|     return cudaErrorUnknown;
 | |
|   }
 | |
| 
 | |
|   return cudaSuccess;
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Entry point to basic_syrk example.
 | |
| //
 | |
| // usage:
 | |
| //
 | |
| //   00_basic_syrk <N> <K> <alpha> <beta>
 | |
| //
 | |
| int main(int argc, const char *arg[]) {
 | |
| 
 | |
|   bool notSupported = false;
 | |
| 
 | |
|   // CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
 | |
|   if (!(__CUDACC_VER_MAJOR__ >= 11)) {
 | |
|     std::cerr << "NVIDIA  Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
 | |
|     notSupported = true;
 | |
|   }
 | |
| 
 | |
|   cudaDeviceProp props;
 | |
| 
 | |
|   cudaError_t error = cudaGetDeviceProperties(&props, 0);
 | |
|   if (error != cudaSuccess) {
 | |
|     std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
 | |
| 
 | |
|     return -1;
 | |
|   }
 | |
| 
 | |
|   if (!((props.major * 10 + props.minor) >= 80)) {
 | |
| 
 | |
|     std::cerr << "This example requires compute capability at least 80."
 | |
|               << std::endl;
 | |
|     notSupported = true;
 | |
|   }
 | |
| 
 | |
|   if (notSupported) {
 | |
|     return 0;
 | |
|   }
 | |
| 
 | |
|   //
 | |
|   // Parse the command line to obtain SYRK dimensions and scalar values.
 | |
|   //
 | |
| 
 | |
|   // SYRK problem dimensions.
 | |
|   int problem[2] = { 128, 128 };
 | |
| 
 | |
|   for (int i = 1; i < argc && i < 3; ++i) {
 | |
|     std::stringstream ss(arg[i]);
 | |
|     ss >> problem[i - 1];
 | |
|   }
 | |
| 
 | |
|   // Scalars used for linear scaling the result of the matrix product.
 | |
|   double scalars[2] = { 1, 0 };
 | |
| 
 | |
|   for (int i = 3; i < argc && i < 5; ++i) {
 | |
|     std::stringstream ss(arg[i]);
 | |
|     ss >> scalars[i - 3];
 | |
|   }
 | |
| 
 | |
|   //
 | |
|   // Run the CUTLASS SYRK test.
 | |
|   //
 | |
| 
 | |
|   cudaError_t result = TestCutlassSyrk(
 | |
|     problem[0],     // SYRK N dimension
 | |
|     problem[1],     // SYRK K dimension
 | |
|     scalars[0],     // alpha
 | |
|     scalars[1]      // beta
 | |
|   );
 | |
| 
 | |
|   if (result == cudaSuccess) {
 | |
|     std::cout << "Passed." << std::endl;
 | |
|   }
 | |
| 
 | |
|   // Exit.
 | |
|   return result == cudaSuccess ? 0 : -1;
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 |