Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
gemm.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #if !defined(__CUDACC_RTC__)
31 #include <cuda.h>
32 #endif
33 
34 #include <cutlass/coord.h>
35 #include <cutlass/util/platform.h>
36 
37 namespace cutlass {
38 namespace gemm {
39 
41 
/// The CUTLASS GEMM kernel: one thread block computes one output tile.
/// Expects the grid/block configuration produced by Gemm_::launch (grid.x/y
/// cover the M/N tiles, block.x == Gemm_::Traits::GemmConfig::kThreads).
template <typename Gemm_>
__global__ void gemm_kernel(typename Gemm_::Params params) {
  // Statically-sized shared memory backing the main loop and the epilogue.
  __shared__ typename Gemm_::SharedStorage shared_storage;

  // Construct the device-side GEMM object and run the whole computation.
  Gemm_ gemm(params, shared_storage);
  gemm.multiply_add();
}
52 
54 
/// Plain-old-data description of a GEMM problem: sizes, scaling factors and
/// the (device) pointers/strides of the four matrices. Consumed by
/// Gemm::Params::initialize, which forwards it to Traits::Params::initialize.
template <typename Scalar_, typename Index_ = int>
struct GemmDesc {
  /// The dimensions of the GEMM.
  Index_ m, n, k;
  /// The alpha/beta scaling values.
  Scalar_ alpha, beta;
  /// The source matrix A.
  void const* d_a;
  /// The stride (leading dimension) of A.
  Index_ lda;
  /// The source matrix B.
  void const* d_b;
  /// The stride (leading dimension) of B.
  Index_ ldb;
  /// The source matrix C.
  void const* d_c;
  /// The stride (leading dimension) of C.
  Index_ ldc;
  /// The destination matrix D.
  void* d_d;
  /// The stride (leading dimension) of D.
  Index_ ldd;
};
78 
80 
/// The device-side GEMM. The tiling, iterators, math functor and epilogue are
/// all supplied by GemmTraits_; this class only sequences the software
/// pipeline: global->shared loads, shared->register loads, multiply-adds and
/// the final epilogue.
template <typename GemmTraits_>
struct Gemm {
  /// This class.
  typedef Gemm<GemmTraits_> This_;
  /// The traits.
  typedef GemmTraits_ Traits;
  /// The shared storage.
  typedef typename Traits::SharedStorage SharedStorage;

  /// The scalar for A.
  typedef typename Traits::ScalarA ScalarA;
  /// The scalar for B.
  typedef typename Traits::ScalarB ScalarB;
  /// The scalar in the epilogue.
  typedef typename Traits::Epilogue::Scalar ScalarEpilogue;
  /// The scalar for C.
  typedef typename Traits::Epilogue::ScalarC ScalarC;
  /// The scalar for D.
  typedef typename Traits::Epilogue::ScalarD ScalarD;
  /// The index.
  typedef typename Traits::Index Index;

  /// The number of threads per block.
  static int const kThreads = Traits::GemmConfig::kThreads;

  /// The params.
  struct Params : public Traits::Params {
    /// Initialize the parameters from the user-visible GEMM arguments.
    /// Packs them into a GemmDesc and forwards to Traits::Params::initialize;
    /// returns its error code (0 on success).
    CUTLASS_HOST_DEVICE int initialize(Index m,
                                       Index n,
                                       Index k,
                                       ScalarEpilogue alpha,
                                       ScalarA const* d_a,
                                       Index lda,
                                       ScalarB const* d_b,
                                       Index ldb,
                                       ScalarEpilogue beta,
                                       ScalarC const* d_c,
                                       Index ldc,
                                       ScalarD* d_d,
                                       Index ldd) {
      GemmDesc<ScalarEpilogue, Index> desc;
      desc.m = m;
      desc.n = n;
      desc.k = k;
      desc.alpha = alpha;
      desc.beta = beta;
      desc.d_a = reinterpret_cast<void const*>(d_a);
      desc.lda = lda;
      desc.d_b = reinterpret_cast<void const*>(d_b);
      desc.ldb = ldb;
      desc.d_c = reinterpret_cast<void const*>(d_c);
      desc.ldc = ldc;
      desc.d_d = reinterpret_cast<void*>(d_d);
      desc.ldd = ldd;
      return Traits::Params::initialize(desc);
    }
  };

#if !defined(__CUDACC_RTC__)
  /// Launch the kernel with the CUDA runtime API.
  static __host__ cudaError_t launch(Params const& params,
                                     cudaStream_t stream = cudaStreamDefault) {
    // Setup the grid: one block per output tile, ceil-div over M and N.
    dim3 grid;
    grid.x = (params.m + Traits::OutputTile::kW - 1) / Traits::OutputTile::kW;
    grid.y = (params.n + Traits::OutputTile::kH - 1) / Traits::OutputTile::kH;

    // The number of threads.
    dim3 block;
    block.x = kThreads;

    // Launch the kernel.
    void const* params_ = reinterpret_cast<void const*>(&params);

    return cudaLaunchKernel(reinterpret_cast<void*>(&gemm_kernel<This_>),
                            grid,
                            block,
                            const_cast<void**>(&params_),
                            0,
                            stream);
  }

  /// Launch the kernel with the CUDA driver API (e.g. for NVRTC-compiled
  /// kernels). The caller supplies the CUfunction; grid/block shapes match
  /// the runtime-API overload above.
  static __host__ cudaError_t launch(CUfunction kernel,
                                     Params const& params,
                                     CUstream stream = CU_STREAM_LEGACY) {
    // Setup the grid: one block per output tile, ceil-div over M and N.
    dim3 grid;
    grid.x = (params.m + Traits::OutputTile::kW - 1) / Traits::OutputTile::kW;
    grid.y = (params.n + Traits::OutputTile::kH - 1) / Traits::OutputTile::kH;

    // The number of threads.
    dim3 block;
    block.x = kThreads;

    // Launch the kernel.
    void* params_[] = {const_cast<void*>(reinterpret_cast<void const*>(&params))};

    CUresult result = cuLaunchKernel(
        kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, stream, params_, 0);

    // NOTE(review): all driver-API failures are collapsed into a single
    // runtime-API error code, losing the specific CUresult.
    if (result != CUDA_SUCCESS) {
      return cudaErrorLaunchFailure;
    }
    return cudaSuccess;
  }

#endif

  /// Ctor.
  CUTLASS_DEVICE Gemm(Params const& params_, SharedStorage& shared_storage_)
      : params(params_), shared_storage(shared_storage_) {}

  /// Do the GEMM: run the double-buffered main loop over K, then the epilogue.
  CUTLASS_DEVICE void multiply_add() {
    // Swizzle the IDs of the block (to enable better cache behavior).
    typename Traits::BlockSwizzle block_swizzle;
    dim3 block = block_swizzle.swizzle();

    // Scale the id to the element coordinates of this block's output tile.
    block.x *= Traits::OutputTile::kW;
    block.y *= Traits::OutputTile::kH;

    // We may want to use shared memory to clear the registers.
    typedef typename Traits::ClearAccumulators ClearAccumulators;

    // The streams to read A/B from global memory to shared memory.
    typename Traits::GlobalLoadStream global_stream(params, shared_storage, block);

    // Create the accumulator clear.
    ClearAccumulators clear(shared_storage.main_loop.clear);

    // The functor doing the math.
    typedef typename Traits::MultiplyAdd MultiplyAdd;

    // By how much we unroll the main loop (K elements consumed per iteration).
    Index const kUnroll = static_cast<Index>(MultiplyAdd::AccumulatorsPerWarp::kD);

    // If we do not have enough steps in the main loop, trigger the residue code.
    if (params.k < kUnroll) {
      global_stream.residue(params.k, true);
    }

    // Fetch the fragments for A and B from global memory.
    global_stream.copy();

    // Copy the elements to shared memory (after transformation if needed).
    global_stream.commit();

    // Make sure the data is in shared memory.
    Traits::shared_store_fence(false);

    // The unrolling steps for the main loop.
    int const kUnrollingSteps =
        MultiplyAdd::AccumulatorsPerWarp::kD / MultiplyAdd::InstructionShape::kD;

    // Make sure we have at least 2 unrolling steps or our pipelining is not going to work.
    static_assert(kUnrollingSteps >= 2, "The pipelining assumes at least two steps");

    // The stream of data from shared memory to fragments.
    typename Traits::SharedLoadStream shared_load_stream(params, shared_storage);

    // Trigger the copy from shared memory for the 1st stream.
    shared_load_stream.copy(0);

    // Allocate the accumulators.
    typename MultiplyAdd::Accumulators accumulators;
    // Clear the accumulators.
    clear.clear(accumulators);

    // Enter the main loop and iterate. outer_k counts down so the final
    // iterations can apply the residue predicates.
    for (Index outer_k = params.k - kUnroll; outer_k > -kUnroll; outer_k -= kUnroll) {
      // If that's the last "load iteration" update the predicates.
      int const is_residue = outer_k <= kUnroll;
      if (is_residue) {
        global_stream.residue(outer_k);
      }

      // Load data for the next iteration of the main loop.
      global_stream.copy();

      // Pipelined math steps: prefetch step+1 from shared memory while
      // computing on step.
      CUTLASS_PRAGMA_UNROLL
      for (int step = 0; step < kUnrollingSteps - 1; ++step) {
        // Trigger the copy from shared memory for the next A/B values.
        shared_load_stream.copy(step + 1);
        // Make sure the values are available for the current iteration to do the multiply-add.
        shared_load_stream.commit(step);

        // Do the math on the fragments of the current iteration.
        MultiplyAdd multiply_add;
        multiply_add.multiply_add(shared_load_stream.fragment_a(step),
                                  shared_load_stream.fragment_b(step),
                                  accumulators,
                                  accumulators);
      }

      // Make sure the data from shared memory has been entirely consumed.
      Traits::shared_load_fence(true);

      // Commit the data in shared memory for A/B.
      global_stream.commit();

      // Make sure the data is in shared memory.
      Traits::shared_store_fence(true);

      // Move to the next stage for the load (if it makes sense).
      shared_load_stream.inc_stage();
      // Trigger the copy from shared memory for the next loop iteration.
      shared_load_stream.copy(0);
      // Make sure the values are available for the current iteration to do the multiply-add.
      shared_load_stream.commit(kUnrollingSteps - 1);

      // Do the math on the fragments of the current iteration (last step).
      MultiplyAdd multiply_add;
      multiply_add.multiply_add(shared_load_stream.fragment_a(kUnrollingSteps - 1),
                                shared_load_stream.fragment_b(kUnrollingSteps - 1),
                                accumulators,
                                accumulators);
    }

    // Epilogue: combine the accumulators with C and store D for this tile.
    typedef typename Traits::Epilogue Epilogue;
    Epilogue epilogue(params.epilogue, shared_storage.epilogue, params.m, params.n);
    epilogue.epilogue(cutlass::make_Coord(0, block.y, block.x), accumulators);
  }

  /// The params.
  Params const& params;
  /// The shared storage.
  SharedStorage& shared_storage;
};
315 
317 
318 } // namespace gemm
319 } // namespace cutlass
Definition: gemm.h:56
Definition: convert.h:33
SharedStorage & shared_storage
The shared storage.
Definition: gemm.h:313
Traits::Epilogue::ScalarD ScalarD
The scalar for D.
Definition: gemm.h:99
Scalar_ beta
Definition: gemm.h:60
Index_ k
Definition: gemm.h:58
Traits::SharedStorage SharedStorage
The shared storage.
Definition: gemm.h:88
The params.
Definition: gemm.h:107
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:241
Params const & params
The params.
Definition: gemm.h:311
Index_ m
The dimensions of the GEMM.
Definition: gemm.h:58
Traits::Epilogue::ScalarC ScalarC
The scalar for C.
Definition: gemm.h:97
Index_ ldb
The stride for B.
Definition: gemm.h:68
C++ features that may be otherwise unimplemented for CUDA device functions.
CUTLASS_DEVICE void multiply_add()
Do the GEMM.
Definition: gemm.h:197
GemmTraits_ Traits
The traits.
Definition: gemm.h:86
Traits::Epilogue::Scalar ScalarEpilogue
The scalar in the epilogue.
Definition: gemm.h:95
Index_ n
Definition: gemm.h:58
Traits::ScalarB ScalarB
The scalar for B.
Definition: gemm.h:93
Definition: clear_accumulators.h:38
void * d_d
The destination matrix D.
Definition: gemm.h:74
Definition: gemm.h:82
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:60
static __host__ cudaError_t launch(CUfunction kernel, Params const &params, CUstream stream=CU_STREAM_LEGACY)
Launch the kernel.
Definition: gemm.h:164
void const * d_a
The source matrix A.
Definition: gemm.h:62
__global__ void gemm_kernel(typename Gemm_::Params params)
Definition: gemm.h:43
CUTLASS_HOST_DEVICE int initialize(Index m, Index n, Index k, ScalarEpilogue alpha, ScalarA const *d_a, Index lda, ScalarB const *d_b, Index ldb, ScalarEpilogue beta, ScalarC const *d_c, Index ldc, ScalarD *d_d, Index ldd)
Definition: gemm.h:108
Index_ lda
The stride for A.
Definition: gemm.h:64
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
Gemm< GemmTraits_ > This_
This class.
Definition: gemm.h:84
Index_ ldc
The stride for C.
Definition: gemm.h:72
CUTLASS_DEVICE Gemm(Params const &params_, SharedStorage &shared_storage_)
Ctor.
Definition: gemm.h:193
#define static_assert(__e, __m)
Definition: platform.h:145
Index_ ldd
The stride for D.
Definition: gemm.h:76
Traits::ScalarA ScalarA
The scalar for A.
Definition: gemm.h:91
CUTLASS_DEVICE void clear(Fragment_ &fragment)
Clear the fragment.
Definition: clear_accumulators.h:47
static int const kThreads
The number of threads.
Definition: gemm.h:104
Scalar_ alpha
The alpha/beta scaling values.
Definition: gemm.h:60
void const * d_c
The source matrix C.
Definition: gemm.h:70
static __host__ cudaError_t launch(Params const &params, cudaStream_t stream=cudaStreamDefault)
Launch the kernel.
Definition: gemm.h:141
Traits::Index Index
The index.
Definition: gemm.h:101
void const * d_b
The source matrix B.
Definition: gemm.h:66