d7137f9c0a1633b76455109373887e1640713b5d/docs/gemm__traits_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/convert.h"
 #include "cutlass/matrix_traits.h"
 #include "cutlass/reshape_tile.h"
 #include "cutlass/tile_allocation.h"
 #include "cutlass/tile_iterator.h"
 #include "cutlass/kernel_launch.h"

 #include "cutlass/gemm/clear_accumulators.h"
 #include "cutlass/gemm/gemm_config.h"
 #include "cutlass/gemm/gemm_desc.h"
 #include "cutlass/gemm/gemm_stream_pair.h"
 #include "cutlass/gemm/gemm_global_stream.h"
 #include "cutlass/gemm/gemm_operand.h"
 #include "cutlass/gemm/gemm_shared_stream.h"
 #include "cutlass/gemm/threadblock_swizzle.h"
 #include "cutlass/gemm/gemm.h"
 namespace cutlass {
 namespace gemm {


 /// Primary template of the tile-traits helper for operand A, selected by the layout of A.
 /// It declares no members; the MatrixLayout::kColumnMajor and MatrixLayout::kRowMajor
 /// partial specializations supply the actual traits.
 template <enum MatrixLayout::Kind, typename GemmConfig_>
 struct GemmTileTraitsHelperA {};


 /// Tile traits for operand A when A is column-major (A^N). Derives, from GemmConfig_, the
 /// traits used to load A from global memory, store it to shared memory, and load it from
 /// shared memory. No skew is applied for this layout (the load traits receive a skew of 0).
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_> {
   /// The layout of A handled by this specialization.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   /// The input scalar.
   typedef typename GemmConfig_::ScalarA Scalar;
   /// The A scalar of the multiply-add functor; element type of the shared-memory tile traits.
   typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar;

   /// The traits class to build the iterator to load data from global memory for A^N.
   typedef GemmGlobalTileTraits<
       // That's A.
       GemmOperand::kA,
       // A is column-major.
       MatrixLayout::kColumnMajor,
       // The element type seen through the pointer: the const input scalar.
       Scalar const,
       // The tile has size KxM in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW>,
       // The threads are distributed as warps x kWarpSize (the traits may reorganize).
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgA>
       GlobalTileTraits;

   /// The traits class to build the iterator to store data to shared memory for A^N.
   typedef GemmSharedStoreTileAbTraits<
       // The element type seen through the pointer: the multiply-add scalar.
       MultiplyAddScalar,
       // The tile has size KxM in GEMM's terminology.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
       // Same thread arrangement as the global tile (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsA>
       SharedStoreTileTraits;

   /// The traits class to build the iterator to load from shared memory for A^N.
   typedef GemmSharedLoadTileATraits<
       // The element type seen through the pointer: the const multiply-add scalar.
       MultiplyAddScalar const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       GemmConfig_::kScalarsPerLdsA,
       // The skew: none for this layout.
       0>
       SharedLoadTileTraits;
 };


 /// Tile traits for operand A when A is row-major (A^T). The shared-memory store tile is
 /// built with a skew in its W dimension, and that same skew (SharedStoreTileTraits::kSkew)
 /// is forwarded to the shared-memory load tile.
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_> {
   /// The layout of A handled by this specialization.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   /// The input scalar.
   typedef typename GemmConfig_::ScalarA Scalar;
   /// The A scalar of the multiply-add functor; element type of the shared-memory tile traits.
   typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar;

   /// The traits class to build the iterator to load data from global memory for A^T.
   typedef GemmGlobalTileTraits<
       GemmOperand::kA,          // Operand A.
       MatrixLayout::kRowMajor,  // A is row-major.
       Scalar const,             // Element type seen through the pointer.
       // The tile has size MxK in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       GemmConfig_::kScalarsPerLdgA  // Scalars per LDG (LDG.32 or LDG.128, etc).
       >
       GlobalTileTraits;

   /// The number of scalars in 4 bytes; 1 when a scalar is wider than 4 bytes.
   static int const kScalarsIn4B =
       sizeof(MultiplyAddScalar) <= 4 ? 4 / sizeof(MultiplyAddScalar) : 1;
   /// The raw skew for A, derived from a 128-byte span, the STS width and the thread
   /// arrangement along W. It is raised to at least kScalarsPerLdsA where it is used.
   static int const kSkewA = ((128 / sizeof(MultiplyAddScalar)) / GemmConfig_::kScalarsPerStsA /
                              GlobalTileTraits::Threads::kW) *
                             kScalarsIn4B;

   /// The traits class to build the iterator to store data to shared memory for A^T.
   typedef GemmSharedStoreWithSkewTileAbTraits<
       MultiplyAddScalar,  // Element type seen through the pointer.
       // The tile has size KxM in GEMM's terminology.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
       // Same thread arrangement as the global tile (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       GemmConfig_::kScalarsPerStsA,  // Scalars per STS.
       // The skew added in the tile W dimension to avoid bank conflicts: the larger of
       // kSkewA and kScalarsPerLdsA. Parenthesized so '<' is clearly a comparison.
       (kSkewA < GemmConfig_::kScalarsPerLdsA ? GemmConfig_::kScalarsPerLdsA : kSkewA)>
       SharedStoreTileTraits;

   /// The traits class to build the iterator to load from shared memory for A^T.
   typedef GemmSharedLoadTileATraits<
       MultiplyAddScalar const,                            // Element type seen through the pointer.
       typename GemmConfig_::OutputTile,                   // The output tile size.
       typename GemmConfig_::Warps,                        // The number of warps.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,  // The number of threads per warp.
       typename GemmConfig_::InstructionShape,             // The shape of the FMA instruction.
       GemmConfig_::kStages,                               // The number of stages.
       GemmConfig_::kScalarsPerLdsA,                       // The number of scalars per LDS.
       SharedStoreTileTraits::kSkew                        // The skew used by the store tile.
       >
       SharedLoadTileTraits;
 };


 /// Primary template of the tile-traits helper for operand B, selected by the layout of B.
 /// It declares no members; the MatrixLayout::kColumnMajor and MatrixLayout::kRowMajor
 /// partial specializations supply the actual traits.
 template <enum MatrixLayout::Kind, typename GemmConfig_>
 struct GemmTileTraitsHelperB {};


 /// Tile traits for operand B when B is column-major (B^N). The shared-memory store tile is
 /// built with a skew in its W dimension, and that same skew (SharedStoreTileTraits::kSkew)
 /// is forwarded to the shared-memory load tile.
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_> {
   /// The layout of B handled by this specialization.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   /// The input scalar.
   typedef typename GemmConfig_::ScalarB Scalar;
   /// The B scalar of the multiply-add functor; element type of the shared-memory tile traits.
   typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar;

   /// The traits class to build the iterator to load data from global memory for B^N.
   typedef GemmGlobalTileTraits<
       GemmOperand::kB,             // Operand B.
       MatrixLayout::kColumnMajor,  // B is column-major.
       Scalar const,                // Element type seen through the pointer.
       // The tile has size NxK in GEMM's terminology (OutputTile::kH x OutputTile::kD).
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       GemmConfig_::kScalarsPerLdgB  // Scalars per LDG (LDG.32 or LDG.128, etc).
       >
       GlobalTileTraits;

   /// The number of scalars in 4 bytes; 1 when a scalar is wider than 4 bytes.
   static int const kScalarsIn4B =
       sizeof(MultiplyAddScalar) <= 4 ? 4 / sizeof(MultiplyAddScalar) : 1;
   /// The raw skew for B, derived from a 128-byte span, the STS width and the thread
   /// arrangement along W. It is raised to at least kScalarsPerLdsB where it is used.
   static int const kSkewB = ((128 / sizeof(MultiplyAddScalar)) / GemmConfig_::kScalarsPerStsB /
                              GlobalTileTraits::Threads::kW) *
                             kScalarsIn4B;

   /// The traits class to build the iterator to store data to shared memory for B^N.
   typedef GemmSharedStoreWithSkewTileAbTraits<
       MultiplyAddScalar,  // Element type seen through the pointer.
       // The tile has size KxN in GEMM's terminology.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
       // Same thread arrangement as the global tile (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       GemmConfig_::kScalarsPerStsB,  // Scalars per STS.
       // The skew added in the tile W dimension to avoid bank conflicts: the larger of
       // kSkewB and kScalarsPerLdsB. Parenthesized so '<' is clearly a comparison.
       (kSkewB < GemmConfig_::kScalarsPerLdsB ? GemmConfig_::kScalarsPerLdsB : kSkewB)>
       SharedStoreTileTraits;

   /// The traits class to build the iterator to load from shared memory for B^N.
   typedef GemmSharedLoadTileBTraits<
       MultiplyAddScalar const,                            // Element type seen through the pointer.
       typename GemmConfig_::OutputTile,                   // The output tile size.
       typename GemmConfig_::Warps,                        // The number of warps.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,  // The number of threads per warp.
       typename GemmConfig_::InstructionShape,             // The shape of the FMA instruction.
       GemmConfig_::kStages,                               // The number of stages.
       GemmConfig_::kScalarsPerLdsB,                       // The number of scalars per LDS.
       SharedStoreTileTraits::kSkew                        // The skew used by the store tile.
       >
       SharedLoadTileTraits;
 };


 /// Tile traits for operand B when B is row-major (B^T). Derives, from GemmConfig_, the
 /// traits used to load B from global memory, store it to shared memory, and load it from
 /// shared memory. No skew is applied for this layout (the load traits receive a skew of 0).
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_> {
   /// The layout of B handled by this specialization.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   /// The input scalar.
   typedef typename GemmConfig_::ScalarB Scalar;
   /// The B scalar of the multiply-add functor; element type of the shared-memory tile traits.
   typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar;

   /// The traits class to build the iterator to load data from global memory for B^T.
   typedef GemmGlobalTileTraits<
       // That's B.
       GemmOperand::kB,
       // B is row-major.
       MatrixLayout::kRowMajor,
       // The element type seen through the pointer: the const input scalar.
       Scalar const,
       // The tile has size KxN in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH>,
       // The threads are distributed as warps x kWarpSize (the traits may reorganize).
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgB>
       GlobalTileTraits;

   /// The traits class to build the iterator to store data to shared memory for B^T.
   typedef GemmSharedStoreTileAbTraits<
       // The element type seen through the pointer: the multiply-add scalar.
       MultiplyAddScalar,
       // The tile has size KxN in GEMM's terminology.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
       // Same thread arrangement as the global tile (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsB>
       SharedStoreTileTraits;

   /// The traits class to build the iterator to load from shared memory for B^T.
   typedef GemmSharedLoadTileBTraits<
       // The element type seen through the pointer: the const multiply-add scalar.
       MultiplyAddScalar const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       GemmConfig_::kScalarsPerLdsB,
       // The skew: none for this layout.
       0>
       SharedLoadTileTraits;
 };


 /// Collects the compile-time building blocks of a GEMM kernel (configuration, global and
 /// shared load streams, epilogue, block swizzle, index type, accumulator clearing) together
 /// with the host-constructible Params object and the shared-memory storage layout.
 template <
     /// The GEMM configuration.
     typename GemmConfig_,
     /// The stream to load A from global memory to shared memory.
     typename GlobalLoadStreamA_,
     /// The stream to load B from global memory to shared memory.
     typename GlobalLoadStreamB_,
     /// The stream to load A from shared memory.
     typename SharedLoadStreamA_,
     /// The stream to load B from shared memory.
     typename SharedLoadStreamB_,
     /// The epilogue.
     typename Epilogue_,
     /// The block swizzle used to compute the grid layout.
     typename BlockSwizzle_ = IdentityBlockSwizzle,
     /// The index type.
     typename Index_ = int,
     /// The tool used to clear accumulators.
     typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element> >
 struct GemmTraits {
   /// This traits.
   typedef GemmTraits<GemmConfig_,
                      GlobalLoadStreamA_,
                      GlobalLoadStreamB_,
                      SharedLoadStreamA_,
                      SharedLoadStreamB_,
                      Epilogue_,
                      BlockSwizzle_,
                      Index_,
                      ClearAccumulators_>
       This_;

   /// The Gemm kernel class instantiated with these traits.
   typedef typename cutlass::gemm::Gemm<This_> KernelClass;

   /// The configuration.
   typedef GemmConfig_ GemmConfig;
   /// The output tile.
   typedef typename GemmConfig::OutputTile OutputTile;

   /// The stream to load A from global memory to shared memory.
   typedef GlobalLoadStreamA_ GlobalLoadStreamA;
   /// The layout of A, as reported by its global load stream.
   static MatrixLayout::Kind const kLayoutA = GlobalLoadStreamA::kLayout;
   /// The scalar for A.
   typedef typename GlobalLoadStreamA_::Scalar ScalarA;

   /// The stream to load B from global memory to shared memory.
   typedef GlobalLoadStreamB_ GlobalLoadStreamB;
   /// The layout of B, as reported by its global load stream.
   static MatrixLayout::Kind const kLayoutB = GlobalLoadStreamB::kLayout;
   /// The scalar for B.
   typedef typename GlobalLoadStreamB_::Scalar ScalarB;

   /// The iterator for A to load from shared memory.
   typedef SharedLoadStreamA_ SharedLoadStreamA;
   /// The iterator for B to load from shared memory.
   typedef SharedLoadStreamB_ SharedLoadStreamB;

   /// The multiply-add functor.
   typedef typename GemmConfig::MultiplyAdd MultiplyAdd;
   /// The epilogue.
   typedef Epilogue_ Epilogue;
   /// The scalars in the epilogue.
   typedef typename Epilogue::ScalarC ScalarC;
   typedef typename Epilogue::ScalarD ScalarD;

   /// The block swizzle used to compute the grid layout.
   typedef BlockSwizzle_ BlockSwizzle;
   /// The index.
   typedef Index_ Index;
   /// Clear the accumulators.
   typedef ClearAccumulators_ ClearAccumulators;

   /// Assemble the global load streams for A/B.
   typedef GlobalLoadStreamPair<GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInProlog>
       GlobalLoadStream;

   /// The storage type of the threadblock tile, as defined by the global load stream pair.
   typedef typename GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage;

   /// Assemble the shared load streams for A/B.
   typedef SharedStreamPair<SharedLoadStreamA, SharedLoadStreamB> SharedStream;

   /// Parameters object constructable on the host.
   struct Params : public KernelLaunchConfiguration {
     /// GEMM problem size.
     GemmCoord problem_size;

     /// Parameters object for the global load stream.
     typename GlobalLoadStream::Params global_to_shared_stream;

     /// Parameters object for the shared load streams (not written by initialize()).
     typename SharedStream::Params shared_stream;

     /// Parameters object for the epilogue.
     typename Epilogue::Params epilogue;

     /// Initializes the parameters from a GEMM descriptor.
     ///
     /// Records the problem size, fills in the launch block and grid, then initializes the
     /// A stream, the B stream and the epilogue, in that order.
     ///
     /// Returns the first nonzero code reported by a stream initializer; otherwise the
     /// value returned by the epilogue's initialize().
     template <typename GemmDesc_>
     CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc) {
       problem_size = desc.problem_size;

       // Launch configuration: kThreads threads per block, grid layout from the swizzle.
       BlockSwizzle swizzle;
       this->block = dim3(GemmConfig::kThreads);
       this->grid = swizzle.get_grid_layout(problem_size, make_Coord_from_shape<OutputTile>());

       // Offset to the residue along K: K rounded down to a multiple of OutputTile::kD
       // when K is not already a multiple, and 0 otherwise.
       Index const k_extent = problem_size[0];
       Index const k_remainder = k_extent % OutputTile::kD;
       Index const residue_offset = k_remainder ? k_extent - k_remainder : 0;

       // Each step only runs while every previous one reported 0.
       int status = global_to_shared_stream.stream_a.initialize(
           desc.A.data(), desc.batch_stride_A, desc.A.leading_dim(), residue_offset);

       if (status == 0) {
         status = global_to_shared_stream.stream_b.initialize(
             desc.B.data(), desc.batch_stride_B, desc.B.leading_dim(), residue_offset);
       }

       if (status == 0) {
         status = epilogue.initialize(desc);
       }

       return status;
     }

     /// Helper to construct a GEMM params using a BLAS-like API.
     ///
     /// Packs the arguments into a GemmDesc with a batch count of 1 and forwards to the
     /// descriptor-based initialize(); returns its result.
     CUTLASS_HOST_DEVICE int initialize(Index m,
                                        Index n,
                                        Index k,
                                        typename Epilogue::Scalar alpha,
                                        ScalarA const* d_a,
                                        Index lda,
                                        ScalarB const* d_b,
                                        Index ldb,
                                        typename Epilogue::Scalar beta,
                                        ScalarC const* d_c,
                                        Index ldc,
                                        ScalarD* d_d,
                                        Index ldd) {
       typedef GemmDesc<ScalarA, ScalarB, ScalarC, ScalarD, typename Epilogue::Scalar> Desc;

       // The problem coordinate is ordered (k, n, m, batch).
       Desc desc(GemmCoord(k, n, m, 1),
                 alpha,
                 TensorRef<ScalarA const, 2>(d_a, lda),
                 TensorRef<ScalarB const, 2>(d_b, ldb),
                 beta,
                 TensorRef<ScalarC const, 2>(d_c, ldc),
                 TensorRef<ScalarD, 2>(d_d, ldd));

       return this->initialize(desc);
     }

     /// Helper to construct a batched GEMM params using a BLAS-like API.
     ///
     /// Same as the non-batched overload, but also passes one batch stride per operand
     /// and the batch count to the GemmDesc.
     CUTLASS_HOST_DEVICE int initialize(Index m,
                                        Index n,
                                        Index k,
                                        typename Epilogue::Scalar alpha,
                                        ScalarA const* d_a,
                                        Index lda,
                                        long long int batch_stride_A,
                                        ScalarB const* d_b,
                                        Index ldb,
                                        long long int batch_stride_B,
                                        typename Epilogue::Scalar beta,
                                        ScalarC const* d_c,
                                        Index ldc,
                                        long long int batch_stride_C,
                                        ScalarD* d_d,
                                        Index ldd,
                                        long long int batch_stride_D,
                                        Index batch_count) {
       typedef GemmDesc<ScalarA, ScalarB, ScalarC, ScalarD, typename Epilogue::Scalar> Desc;

       // The problem coordinate is ordered (k, n, m, batch).
       Desc desc(GemmCoord(k, n, m, batch_count),
                 alpha,
                 TensorRef<ScalarA const, 2>(d_a, lda),
                 batch_stride_A,
                 TensorRef<ScalarB const, 2>(d_b, ldb),
                 batch_stride_B,
                 beta,
                 TensorRef<ScalarC const, 2>(d_c, ldc),
                 batch_stride_C,
                 TensorRef<ScalarD, 2>(d_d, ldd),
                 batch_stride_D);

       return this->initialize(desc);
     }
   };

   /// The storage for the main loop + prologue.
   struct MainLoopSharedStorage {
     /// Stores the threadblock tile.
     ThreadblockTileStorage threadblock_tile;

     /// Storage for the global load stream pair.
     typename GlobalLoadStream::SharedStorage global_to_shared_stream;

     /// Storage for the accumulator-clearing tool.
     typename ClearAccumulators::SharedStorage clear;
   };

   /// The storage in shared memory. A union: the main loop and the epilogue overlay the
   /// same bytes.
   union SharedStorage {
     /// The storage for the main loop.
     MainLoopSharedStorage main_loop;
     /// The storage for the epilogue.
     typename Epilogue::SharedStorage epilogue;
   };

   /// The memory fence for shared loads. Synchronizes the threadblock only when at least
   /// one of the shared load iterators declares kRequiresLoadFence. `in_loop` is unused.
   static CUTLASS_DEVICE void shared_load_fence(bool in_loop) {
     bool const fence_required = SharedLoadStreamA::Iterator::kRequiresLoadFence ||
                                 SharedLoadStreamB::Iterator::kRequiresLoadFence;
     if (fence_required) {
       __syncthreads();
     }
   }

   /// The memory fence for shared stores. Always synchronizes the threadblock.
   /// `in_loop` is unused.
   static CUTLASS_DEVICE void shared_store_fence(bool in_loop) {
     __syncthreads();
   }
 };


 /// Builds the iterators and streams for A and B from the two tile-traits helpers:
 /// global load iterator -> Copy transformer -> shared store iterator (wrapped into a
 /// GlobalLoadStream), plus the shared load iterator (wrapped into a SharedLoadStream).
 template <typename GemmTileTraitsHelperA_, typename GemmTileTraitsHelperB_, typename Index_>
 struct SimplifiedGemmTraitsHelper {
   /// The global iterator to load A from global memory.
   typedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperA_::GlobalTileTraits, Index_>
       GlobalLoadIteratorA;
   /// The data converter for A before storing to shared memory.
   typedef Copy<typename GlobalLoadIteratorA::Fragment> GlobalTransformerA;
   /// The iterator to store A to shared memory.
   typedef TileStoreIterator<typename GemmTileTraitsHelperA_::SharedStoreTileTraits,
                             typename GemmTileTraitsHelperA_::SharedStoreTileTraits::Scalar,
                             IteratorAdvance::kH,
                             MemorySpace::kShared>
       SharedStoreIteratorA;
   /// The stream to load A from global memory to shared memory.
   typedef GlobalLoadStream<GemmOperand::kA,
                               GlobalLoadIteratorA,
                               SharedStoreIteratorA,
                               GlobalTransformerA>
       GlobalLoadStreamA;

   /// The global iterator to load B from global memory.
   typedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperB_::GlobalTileTraits, Index_>
       GlobalLoadIteratorB;
   /// The data converter for B before storing to shared memory.
   typedef Copy<typename GlobalLoadIteratorB::Fragment> GlobalTransformerB;
   /// The iterator to store B to shared memory.
   typedef TileStoreIterator<typename GemmTileTraitsHelperB_::SharedStoreTileTraits,
                             typename GemmTileTraitsHelperB_::SharedStoreTileTraits::Scalar,
                             IteratorAdvance::kH,
                             MemorySpace::kShared>
       SharedStoreIteratorB;
   /// The stream to load B from global memory to shared memory.
   typedef GlobalLoadStream<GemmOperand::kB,
                               GlobalLoadIteratorB,
                               SharedStoreIteratorB,
                               GlobalTransformerB>
       GlobalLoadStreamB;

   /// The iterator to load A from shared memory.
   typedef TileLoadIterator<typename GemmTileTraitsHelperA_::SharedLoadTileTraits,
                            typename GemmTileTraitsHelperA_::Scalar,
                            IteratorAdvance::kH,
                            MemorySpace::kShared>
       SharedLoadIteratorA;
   /// The stream to load A from shared memory.
   typedef SharedLoadStream<SharedLoadIteratorA> SharedLoadStreamA;
   /// The iterator to load B from shared memory.
   typedef TileLoadIterator<typename GemmTileTraitsHelperB_::SharedLoadTileTraits,
                            typename GemmTileTraitsHelperB_::Scalar,
                            IteratorAdvance::kH,
                            MemorySpace::kShared>
       SharedLoadIteratorB;
   /// The stream to load B from shared memory.
   typedef SharedLoadStream<SharedLoadIteratorB> SharedLoadStreamB;
 };


 /// Convenience traits: a GemmTraits whose streams are all taken from Helper_, so that a
 /// GEMM can be described by just the two layouts, a configuration and an epilogue.
 /// The block swizzle is fixed to IdentityBlockSwizzle and the accumulators are cleared
 /// with ClearAccumulators over GemmConfig_::Accumulators::Element. The struct adds no
 /// members of its own.
 template <
     // The layout for A.
     MatrixLayout::Kind kLayoutA_,
     // The layout for B.
     MatrixLayout::Kind kLayoutB_,
     // The GEMM configuration.
     typename GemmConfig_,
     // The epilogue.
     typename Epilogue_,
     // The index.
     typename Index_ = int,
     // The configuration for the A matrix.
     typename GemmTileTraitsHelperA_ = GemmTileTraitsHelperA<kLayoutA_, GemmConfig_>,
     // The configuration for the B matrix.
     typename GemmTileTraitsHelperB_ = GemmTileTraitsHelperB<kLayoutB_, GemmConfig_>,
     // The helper class to create the streams and iterators.
     typename Helper_ =
         SimplifiedGemmTraitsHelper<GemmTileTraitsHelperA_, GemmTileTraitsHelperB_, Index_> >
 struct SimplifiedGemmTraits : public GemmTraits<
                                   // The config.
                                   GemmConfig_,
                                   // The stream to load A from global memory to shared memory.
                                   typename Helper_::GlobalLoadStreamA,
                                   // The stream to load B from global memory to shared memory.
                                   typename Helper_::GlobalLoadStreamB,
                                   // The stream to load A from shared memory.
                                   typename Helper_::SharedLoadStreamA,
                                   // The stream to load B from shared memory.
                                   typename Helper_::SharedLoadStreamB,
                                   // The epilogue.
                                   Epilogue_,
                                   // The block swizzle to reorganize the grid.
                                   IdentityBlockSwizzle,
                                   // The index.
                                   Index_,
                                   // The tool used to clear accumulators.
                                   ClearAccumulators<typename GemmConfig_::Accumulators::Element> > {
 };


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::GemmTraits::SharedStorage::epilogue
Epilogue::SharedStorage epilogue
Definition: gemm_traits.h:555

cutlass::gemm::GemmDesc
GEMM problem description.
Definition: gemm_desc.h:50

cutlass::gemm::GemmTraits::GlobalLoadStreamA
GlobalLoadStreamA_ GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: gemm_traits.h:370

cutlass::gemm::GemmEpilogue::ScalarD
GlobalStoreIteratorD::Scalar ScalarD
The scalar for D.
Definition: gemm_epilogue.h:85

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadStreamA
GlobalLoadStream< GemmOperand::kA, GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: gemm_traits.h:592

cutlass::MemorySpace::kShared
Definition: load_store.h:41

cutlass::gemm::GemmTraits::SharedLoadStreamA
SharedLoadStreamA_ SharedLoadStreamA
The iterator for A to load from shared memory.
Definition: gemm_traits.h:384

cutlass
Definition: convert.h:33

cutlass::gemm::GemmSharedLoadTileATraits
Definition: gemm_shared_tile.h:128

cutlass::gemm::GemmTraits::GlobalLoadStreamB
GlobalLoadStreamB_ GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: gemm_traits.h:377

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits
Definition: gemm_shared_tile.h:80

cutlass::gemm::GemmConfig::kThreads
static int const kThreads
The number of threads.
Definition: gemm_config.h:103

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedStoreIteratorA
TileStoreIterator< typename GemmTileTraitsHelperA_::SharedStoreTileTraits, typename GemmTileTraitsHelperA_::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: gemm_traits.h:586

tile_iterator.h
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.

cutlass::gemm::GemmTraits::ScalarD
Epilogue::ScalarD ScalarD
Definition: gemm_traits.h:394

cutlass::gemm::GemmTraits::Params::initialize
CUTLASS_HOST_DEVICE int initialize(Index m, Index n, Index k, typename Epilogue::Scalar alpha, ScalarA const *d_a, Index lda, ScalarB const *d_b, Index ldb, typename Epilogue::Scalar beta, ScalarC const *d_c, Index ldc, ScalarD *d_d, Index ldd)
Helper to construct a GEMM params using a BLAS-like API.
Definition: gemm_traits.h:474

cutlass::gemm::GemmTraits::SharedStorage
The storage in shared memory.
Definition: gemm_traits.h:551

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadStreamB
SharedLoadStream< SharedLoadIteratorB > SharedLoadStreamB
The stream to load B from shared memory.
Definition: gemm_traits.h:627

cutlass::gemm::GemmGlobalTileTraits
Definition: gemm_global_tile.h:70

cutlass::gemm::GlobalLoadStreamPair::SharedStorage
Defines a structure containing shared storage for each pair.
Definition: gemm_stream_pair.h:91

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadStreamB
GlobalLoadStream< GemmOperand::kB, GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: gemm_traits.h:610

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarB Scalar
The input scalar.
Definition: gemm_traits.h:201

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::SharedStoreTileTraits
GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^T.
Definition: gemm_traits.h:304

cutlass::gemm::GemmCoord
Definition: gemm_coord.h:43

cutlass::gemm::GemmTraits::This_
GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > This_
This traits.
Definition: gemm_traits.h:359

cutlass::gemm::GemmTraits::SharedLoadStreamB
SharedLoadStreamB_ SharedLoadStreamB
The iterator for B to load from shared memory.
Definition: gemm_traits.h:386

kernel_launch.h
Defines structures and helpers to launch CUDA kernels within CUTLASS.

cutlass::gemm::GemmTraits::GlobalLoadStream
GlobalLoadStreamPair< GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInProlog > GlobalLoadStream
Assemble the global load streams for A/B.
Definition: gemm_traits.h:407

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: gemm_traits.h:219

cutlass::Copy
Definition: convert.h:69

cutlass::gemm::GemmTraits::MainLoopSharedStorage::threadblock_tile
ThreadblockTileStorage threadblock_tile
Stores the threadblock tile.
Definition: gemm_traits.h:541

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadStreamA
SharedLoadStream< SharedLoadIteratorA > SharedLoadStreamA
The stream to load A from shared memory.
Definition: gemm_traits.h:619

cutlass::gemm::GemmSharedStoreTileAbTraits
Definition: gemm_shared_tile.h:38

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, 0 > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^N.
Definition: gemm_traits.h:114

cutlass::gemm::GemmTraits::Epilogue
Epilogue_ Epilogue
The epilogue.
Definition: gemm_traits.h:391

cutlass::gemm::GemmTraits::ScalarA
GlobalLoadStreamA_::Scalar ScalarA
The scalar for A.
Definition: gemm_traits.h:374

cutlass::IteratorAdvance::kH
Definition: tile_iterator.h:65

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^N.
Definition: gemm_traits.h:80

cutlass::gemm::GemmTraits::Params::global_to_shared_stream
GlobalLoadStream::Params global_to_shared_stream
Parameters object for the global load stream.
Definition: gemm_traits.h:422

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarA Scalar
The input scalar.
Definition: gemm_traits.h:62

cutlass::gemm::GemmSharedLoadTileBTraits
Definition: gemm_shared_tile.h:200

cutlass::gemm::GemmGlobalIteratorAb
Definition: gemm_global_tile.h:163

cutlass::gemm::GemmTraits::ScalarC
Epilogue::ScalarC ScalarC
The scalars in the epilogue.
Definition: gemm_traits.h:393

cutlass::gemm::GemmTraits::MultiplyAdd
GemmConfig::MultiplyAdd MultiplyAdd
The multiply-add functor.
Definition: gemm_traits.h:389

cutlass::gemm::GemmTraits::shared_load_fence
static CUTLASS_DEVICE void shared_load_fence(bool in_loop)
The memory fence for shared loads.
Definition: gemm_traits.h:559

cutlass::gemm::GemmTraits::GemmConfig
GemmConfig_ GemmConfig
The configuration.
Definition: gemm_traits.h:365

cutlass::gemm::GlobalLoadStream
Definition: gemm_global_stream.h:52

cutlass::gemm::GemmTileTraitsHelperB
Definition: gemm_traits.h:191

cutlass::gemm::ClearAccumulators
Definition: clear_accumulators.h:38

cutlass::gemm::GemmTraits::Params
Parameters object constructable on the host.
Definition: gemm_traits.h:416

cutlass::gemm::SharedStreamPair
Collect the global load streams for multiplicands.
Definition: gemm_stream_pair.h:173

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalTransformerB
Copy< typename GlobalLoadIteratorB::Fragment > GlobalTransformerB
The data converter for B before storing to shared memory.
Definition: gemm_traits.h:598

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarB Scalar
The input scalar.
Definition: gemm_traits.h:272

cutlass::gemm::GlobalLoadStreamPair::Params::stream_b
StreamB::Params stream_b
Parameters object for StreamB.
Definition: gemm_stream_pair.h:67

cutlass::gemm::Gemm
Definition: gemm.h:92

cutlass::MatrixLayout
Defines data layouts of various matrix formats usable by TensorRef and other classes.
Definition: matrix_traits.h:156

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadIteratorB
GemmGlobalIteratorAb< typename GemmTileTraitsHelperB_::GlobalTileTraits, Index_ > GlobalLoadIteratorB
The global iterator to load B from global memory.
Definition: gemm_traits.h:596

cutlass::gemm::GemmConfig::kResidueInProlog
static bool const kResidueInProlog
If true, residue is computed in the prologue.
Definition: gemm_config.h:136

cutlass::TileLoadIterator
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:399

cutlass::gemm::GemmTraits::MainLoopSharedStorage
Definition: gemm_traits.h:539

cutlass::gemm::GlobalLoadStreamPair
Collect the global load streams for multiplicands.
Definition: gemm_stream_pair.h:50

cutlass::gemm::GemmConfig::MultiplyAdd
MultiplyAdd_ MultiplyAdd
The functor to do D = A*B + C.
Definition: gemm_config.h:90

cutlass::MatrixLayout::kRowMajor
Definition: matrix_traits.h:159

tile_allocation.h
Defines a fragment based on a Shape<> template.

cutlass::KernelLaunchConfiguration
Structure containing the basic launch configuration of a CUDA kernel.
Definition: kernel_launch.h:38

cutlass::gemm::GemmTraits::ClearAccumulators
ClearAccumulators_ ClearAccumulators
Clear the accumulators.
Definition: gemm_traits.h:401

cutlass::gemm::SharedLoadStream
Definition: gemm_shared_stream.h:45

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: gemm_traits.h:143

cutlass::gemm::GlobalLoadStreamPair::Params
Parameters object.
Definition: gemm_stream_pair.h:62

reshape_tile.h
Defines a type for restructuring a tile.

gemm_operand.h
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...

cutlass::gemm::GemmTraits::Params::problem_size
GemmCoord problem_size
GEMM problem size.
Definition: gemm_traits.h:419

gemm_desc.h
Implements a software-pipelined efficient GEMM.

cutlass::gemm::GemmTraits::Params::initialize
CUTLASS_HOST_DEVICE int initialize(Index m, Index n, Index k, typename Epilogue::Scalar alpha, ScalarA const *d_a, Index lda, long long int batch_stride_A, ScalarB const *d_b, Index ldb, long long int batch_stride_B, typename Epilogue::Scalar beta, ScalarC const *d_c, Index ldc, long long int batch_stride_C, ScalarD *d_d, Index ldd, long long int batch_stride_D, Index batch_count)
Helper to construct the params for a batched GEMM.
Definition: gemm_traits.h:501

clear_accumulators.h
Defines abstractions for efficiently clearing accumulator tiles.

cutlass::TensorRef
Definition: tensor_ref.h:131

cutlass::gemm::GemmTraits::SharedStream
SharedStreamPair< SharedLoadStreamA, SharedLoadStreamB > SharedStream
Assemble the shared load streams for A/B.
Definition: gemm_traits.h:413

cutlass::gemm::GemmTraits::shared_store_fence
static CUTLASS_DEVICE void shared_store_fence(bool in_loop)
The memory fence for shared stores.
Definition: gemm_traits.h:567

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarA Scalar
The input scalar.
Definition: gemm_traits.h:125

cutlass::ZipTileAllocation
Manages a pair of tile allocations as if they are one allocation.
Definition: tile_allocation.h:100

cutlass::gemm::GemmTileTraitsHelperA
Definition: gemm_traits.h:52

cutlass::gemm::GemmTraits::Params::initialize
CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const &desc)
Initialize the parameters.
Definition: gemm_traits.h:432

cutlass::GemmOperand::kB
Definition: matrix_traits.h:357

cutlass::gemm::IdentityBlockSwizzle
Definition: threadblock_swizzle.h:65

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedStoreTileTraits
GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: gemm_traits.h:94

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:274

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:203

cutlass::gemm::GemmTraits::ScalarB
GlobalLoadStreamB_::Scalar ScalarB
The scalar for B.
Definition: gemm_traits.h:381

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46

gemm_config.h
Defines properties of GEMM computation that impose some constraints on caller.

cutlass::gemm::GemmTraits
Definition: gemm_traits.h:349

cutlass::Shape
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64

cutlass::gemm::GemmTraits::KernelClass
cutlass::gemm::Gemm< This_ > KernelClass
The struct that consumes this Traits.
Definition: gemm_traits.h:362

cutlass::gemm::GemmTraits::Params::shared_stream
SharedStream::Params shared_stream
Parameters object for the shared load stream.
Definition: gemm_traits.h:425

cutlass::gemm::GemmGlobalTileTraits::Threads
ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:88

cutlass::gemm::GemmTraits::BlockSwizzle
BlockSwizzle_ BlockSwizzle
The block swizzle to reorganize the grid.
Definition: gemm_traits.h:397

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadIteratorA
TileLoadIterator< typename GemmTileTraitsHelperA_::SharedLoadTileTraits, typename GemmTileTraitsHelperA_::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: gemm_traits.h:617

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:159

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadIteratorB
TileLoadIterator< typename GemmTileTraitsHelperB_::SharedLoadTileTraits, typename GemmTileTraitsHelperB_::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: gemm_traits.h:625

cutlass::gemm::GemmTraits::ThreadblockTileStorage
GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage
Memory needed to store the threadblock-scoped GEMM tile.
Definition: gemm_traits.h:410

cutlass::KernelLaunchConfiguration::block
dim3 block
CUDA threadblock dimensions.
Definition: kernel_launch.h:44

cutlass::gemm::GemmEpilogue::ScalarC
GlobalLoadIteratorC::Scalar ScalarC
The scalar for C.
Definition: gemm_epilogue.h:83

cutlass::gemm::GemmTraits::Index
Index_ Index
The index.
Definition: gemm_traits.h:399

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:64

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedStoreIteratorB
TileStoreIterator< typename GemmTileTraitsHelperB_::SharedStoreTileTraits, typename GemmTileTraitsHelperB_::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: gemm_traits.h:604

cutlass::gemm::GemmTraits::Params::epilogue
Epilogue::Params epilogue
The params for the epilogue.
Definition: gemm_traits.h:428

cutlass::MatrixLayout::Kind
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159

gemm_stream_pair.h
Defines a pair of GEMM tile streams.

cutlass::gemm::ClearAccumulators::SharedStorage
The shared storage.
Definition: clear_accumulators.h:40

gemm_global_stream.h
Implements efficient loading of the thread block-level tile from global memory and storing to shared ...

cutlass::gemm::GemmTraits::SharedStorage::main_loop
MainLoopSharedStorage main_loop
Definition: gemm_traits.h:553

cutlass::gemm::GemmTraits::kLayoutA
static MatrixLayout::Kind const kLayoutA
The layout of A.
Definition: gemm_traits.h:372

cutlass::KernelLaunchConfiguration::grid
dim3 grid
CUDA grid dimensions.
Definition: kernel_launch.h:41

cutlass::GemmOperand::kA
Definition: matrix_traits.h:357

cutlass::gemm::GemmTraits::MainLoopSharedStorage::global_to_shared_stream
GlobalLoadStream::SharedStorage global_to_shared_stream
Storage for GEMM global stream.
Definition: gemm_traits.h:544

cutlass::gemm::SharedStreamPair::Params
Parameters object passed to load iterators.
Definition: gemm_stream_pair.h:185

threadblock_swizzle.h
Defines functors for mapping blockIdx to partitions of the GEMM computation.

cutlass::gemm::SimplifiedGemmTraitsHelper
Definition: gemm_traits.h:575

gemm.h
Implements a software-pipelined efficient GEMM.

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadIteratorA
GemmGlobalIteratorAb< typename GemmTileTraitsHelperA_::GlobalTileTraits, Index_ > GlobalLoadIteratorA
The global iterator to load A from global memory.
Definition: gemm_traits.h:578

cutlass::gemm::GemmTraits::OutputTile
GemmConfig::OutputTile OutputTile
The output tile.
Definition: gemm_traits.h:367

matrix_traits.h
Defines properties of matrices used to denote layout and operands to GEMM kernels.

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalTransformerA
Copy< typename GlobalLoadIteratorA::Fragment > GlobalTransformerA
The data converter for A before storing to shared memory.
Definition: gemm_traits.h:580

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, 0 > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^T.
Definition: gemm_traits.h:324

cutlass::gemm::GemmTraits::MainLoopSharedStorage::clear
ClearAccumulators::SharedStorage clear
Storage for clearing accumulators.
Definition: gemm_traits.h:547

cutlass::gemm::GlobalLoadStreamPair::Params::stream_a
StreamA::Params stream_a
Parameters object for StreamA.
Definition: gemm_stream_pair.h:64

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^T.
Definition: gemm_traits.h:290

gemm_shared_stream.h
Defines abstractions for managing loading and storing fragments to shared memory in the efficient GEM...

convert.h
Defines conversion operations among Fragments of different base type.

cutlass::gemm::SimplifiedGemmTraits
Definition: gemm_traits.h:650

cutlass::gemm::GemmConfig::OutputTile
OutputTile_ OutputTile
The tile.
Definition: gemm_config.h:88

cutlass::gemm::GemmTraits::kLayoutB
static MatrixLayout::Kind const kLayoutB
The layout of B.
Definition: gemm_traits.h:379

cutlass::TileStoreIterator
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:836

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:127