461f417b9dfb7b7a14fbe65cf7c9191115b3f7b0/docs/gemm__traits_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <cutlass/convert.h>
 #include <cutlass/gemm/clear_accumulators.h>
 #include <cutlass/gemm/gemm_global_stream.h>
 #include <cutlass/gemm/gemm_operand.h>
 #include <cutlass/gemm/gemm_shared_stream.h>
 #include <cutlass/gemm/identity_block_swizzle.h>
 #include <cutlass/matrix_traits.h>
 #include <cutlass/reshape_tile.h>
 #include <cutlass/tile_iterator.h>

 namespace cutlass {
 namespace gemm {


 /// Collects the compile-time parameters of a GEMM: the scalar types, the output tile, the
 /// multiply-add functor, the width of each memory access and the number of stages.
 template <
     /// The scalar type for A.
     typename ScalarA_,
     /// The scalar type for B.
     typename ScalarB_,
     /// The scalar type for C.
     typename ScalarC_,
     /// The scalar type for D.
     typename ScalarD_,
     /// The output tile.
     typename OutputTile_,
     /// The functor to do D = A*B + C.
     typename MultiplyAdd_,
     /// The number of scalars per LDG for A.
     int kScalarsPerLdgA_,
     /// The number of scalars per STS for A.
     int kScalarsPerStsA_,
     /// The number of scalars per LDS for A.
     int kScalarsPerLdsA_,
     /// The number of scalars per LDG for B.
     int kScalarsPerLdgB_,
     /// The number of scalars per STS for B.
     int kScalarsPerStsB_,
     /// The number of scalars per LDS for B.
     int kScalarsPerLdsB_,
     /// The number of scalars per LDG for C and per STG for D (one shared value).
     int kScalarsPerLdgCAndStgD_,
     /// The number of scalars per STS for D.
     int kScalarsPerStsD_,
     /// The number of scalars per LDS for D.
     int kScalarsPerLdsD_,
     /// The number of stages.
     int kStages_>
 struct GemmConfig {
   /// The scalar for A.
   using ScalarA = ScalarA_;
   /// The scalar for B.
   using ScalarB = ScalarB_;
   /// The scalar for C.
   using ScalarC = ScalarC_;
   /// The scalar for D.
   using ScalarD = ScalarD_;

   /// The output tile.
   using OutputTile = OutputTile_;
   /// The functor to do D = A*B + C.
   using MultiplyAdd = MultiplyAdd_;
   /// The shape of the instruction (re-exported from the multiply-add functor).
   using InstructionShape = typename MultiplyAdd::InstructionShape;
   /// The accumulators per warp (re-exported from the multiply-add functor).
   using AccumulatorsPerWarp = typename MultiplyAdd::AccumulatorsPerWarp;
   /// The accumulators (re-exported from the multiply-add functor).
   using Accumulators = typename MultiplyAdd::Accumulators;

   /// The number of warps: the output tile divided by the accumulators held per warp.
   using Warps = typename ShapeDiv<OutputTile, AccumulatorsPerWarp>::Shape;
   /// The warp size, taken from cutlass::kWarpSize.
   static int const kWarpSize = cutlass::kWarpSize;
   /// The number of threads: the warp count multiplied by the warp size.
   static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize;

   /// The number of scalars per LDG/STS/LDS for A.
   static int const kScalarsPerLdgA = kScalarsPerLdgA_;
   static int const kScalarsPerStsA = kScalarsPerStsA_;
   static int const kScalarsPerLdsA = kScalarsPerLdsA_;

   /// The number of scalars per LDG/STS/LDS for B.
   static int const kScalarsPerLdgB = kScalarsPerLdgB_;
   static int const kScalarsPerStsB = kScalarsPerStsB_;
   static int const kScalarsPerLdsB = kScalarsPerLdsB_;

   /// The number of scalars per LDG for C (same template argument as the STG width of D).
   static int const kScalarsPerLdgC = kScalarsPerLdgCAndStgD_;

   /// The number of scalars per STG/STS/LDS for D.
   static int const kScalarsPerStgD = kScalarsPerLdgCAndStgD_;
   static int const kScalarsPerStsD = kScalarsPerStsD_;
   static int const kScalarsPerLdsD = kScalarsPerLdsD_;

   /// The number of accumulators that are going to be fed from one LDS A/B, i.e. the LDS width
   /// divided by the depth (kD) of the instruction shape.
   static int const kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kD;
   static int const kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kD;

   /// The number of stages.
   static int const kStages = kStages_;
 };


 template <enum MatrixLayout::Kind, typename GemmConfig_>
 struct GemmTileTraitsHelperA {};


 /// Tile traits for A when A is column-major (A^N): global load, shared store and shared load.
 /// No skew is used for this layout (the shared-load skew argument is 0).
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_> {
   /// The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   /// The input scalar.
   using Scalar = typename GemmConfig_::ScalarA;
   /// The scalar of A as declared by the multiply-add functor; used for the shared-memory tiles.
   using MultiplyAddScalar = typename GemmConfig_::MultiplyAdd::ScalarA;

   /// The traits class to build the iterator to load data from global memory for A^N.
   using GlobalTileTraits = GemmGlobalTileTraits<
       // The operand is A.
       GemmOperand::kA,
       // A is column-major.
       MatrixLayout::kColumnMajor,
       // The const-qualified input scalar.
       Scalar const,
       // The tile has size KxM in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW>,
       // The threads are distributed as warps x warp-size (the traits may reorganize).
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgA>;

   /// The traits class to build the iterator to store data to shared memory for A^N.
   using SharedStoreTileTraits = GemmSharedStoreTileAbTraits<
       // The scalar stored to shared memory.
       MultiplyAddScalar,
       // One KxM tile per stage; K is divided by the instruction depth and M multiplied by it.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
       // The same thread arrangement as the global load.
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsA>;

   /// The traits class to build the iterator to load from shared memory for A^N.
   using SharedLoadTileTraits = GemmSharedLoadTileATraits<
       // The const-qualified scalar read from shared memory.
       MultiplyAddScalar const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       GemmConfig_::kScalarsPerLdsA,
       // The skew: none for this layout.
       0>;
 };


 /// Tile traits for A when A is row-major (A^T): global load, shared store with skew, and a
 /// shared load that uses the same skew as the store.
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_> {
   /// The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   /// The input scalar.
   using Scalar = typename GemmConfig_::ScalarA;
   /// The scalar of A as declared by the multiply-add functor; used for the shared-memory tiles.
   using MultiplyAddScalar = typename GemmConfig_::MultiplyAdd::ScalarA;

   /// The traits class to build the iterator to load data from global memory for A^T.
   using GlobalTileTraits = GemmGlobalTileTraits<
       // The operand is A.
       GemmOperand::kA,
       // A is row-major.
       MatrixLayout::kRowMajor,
       // The const-qualified input scalar.
       Scalar const,
       // The tile has size MxK in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgA>;

   /// The number of scalars that fit in 4 bytes (1 when a scalar is wider than 4 bytes).
   static int const kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar);

   /// The traits class to build the iterator to store data to shared memory for A^T.
   using SharedStoreTileTraits = GemmSharedStoreWithSkewTileAbTraits<
       // The scalar stored to shared memory.
       MultiplyAddScalar,
       // One KxM tile per stage; K is divided by the instruction depth and M multiplied by it.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
       // The same thread arrangement as the global load: (threads / K) x K.
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS.
       GemmConfig_::kScalarsPerStsA,
       // The skew to avoid bank conflicts added in the tile W dimension.
       128 / sizeof(MultiplyAddScalar) / GemmConfig_::kScalarsPerStsA /
           GlobalTileTraits::Threads::kW * kScalarsIn4B>;

   /// The traits class to build the iterator to load from shared memory for A^T.
   using SharedLoadTileTraits = GemmSharedLoadTileATraits<
       // The const-qualified scalar read from shared memory.
       MultiplyAddScalar const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       GemmConfig_::kScalarsPerLdsA,
       // The skew: must match the one used by the shared store.
       SharedStoreTileTraits::kSkew>;
 };


 template <enum MatrixLayout::Kind, typename GemmConfig_>
 struct GemmTileTraitsHelperB {};


 /// Tile traits for B when B is column-major (B^N): global load, shared store with skew, and a
 /// shared load that uses the same skew as the store.
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_> {
   /// The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   /// The input scalar.
   using Scalar = typename GemmConfig_::ScalarB;
   /// The scalar of B as declared by the multiply-add functor; used for the shared-memory tiles.
   using MultiplyAddScalar = typename GemmConfig_::MultiplyAdd::ScalarB;

   /// The traits class to build the iterator to load data from global memory for B^N.
   using GlobalTileTraits = GemmGlobalTileTraits<
       // The operand is B.
       GemmOperand::kB,
       // B is column-major.
       MatrixLayout::kColumnMajor,
       // The const-qualified input scalar.
       Scalar const,
       // The tile has size NxK in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgB>;

   /// The number of scalars that fit in 4 bytes (1 when a scalar is wider than 4 bytes).
   static int const kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar);

   /// The traits class to build the iterator to store data to shared memory for B^N.
   using SharedStoreTileTraits = GemmSharedStoreWithSkewTileAbTraits<
       // The scalar stored to shared memory.
       MultiplyAddScalar,
       // One KxN tile per stage; K is divided by the instruction depth and N multiplied by it.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
       // The same thread arrangement as the global load: (threads / K) x K.
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS.
       GemmConfig_::kScalarsPerStsB,
       // The skew to avoid bank conflicts added in the tile W dimension.
       128 / sizeof(MultiplyAddScalar) / GemmConfig_::kScalarsPerStsB /
           GlobalTileTraits::Threads::kW * kScalarsIn4B>;

   /// The traits class to build the iterator to load from shared memory for B^N.
   using SharedLoadTileTraits = GemmSharedLoadTileBTraits<
       // The const-qualified scalar read from shared memory.
       MultiplyAddScalar const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       GemmConfig_::kScalarsPerLdsB,
       // The skew: must match the one used by the shared store.
       SharedStoreTileTraits::kSkew>;
 };


 /// Tile traits for B when B is row-major (B^T): global load, shared store and shared load.
 /// No skew is used for this layout (the shared-load skew argument is 0).
 template <typename GemmConfig_>
 struct GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_> {
   /// The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   /// The input scalar.
   using Scalar = typename GemmConfig_::ScalarB;
   /// The scalar of B as declared by the multiply-add functor; used for the shared-memory tiles.
   using MultiplyAddScalar = typename GemmConfig_::MultiplyAdd::ScalarB;

   /// The traits class to build the iterator to load data from global memory for B^T.
   using GlobalTileTraits = GemmGlobalTileTraits<
       // The operand is B.
       GemmOperand::kB,
       // B is row-major.
       MatrixLayout::kRowMajor,
       // The const-qualified input scalar.
       Scalar const,
       // The tile has size KxN in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH>,
       // The threads are distributed as warps x warp-size (the traits may reorganize).
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgB>;

   /// The traits class to build the iterator to store data to shared memory for B^T.
   using SharedStoreTileTraits = GemmSharedStoreTileAbTraits<
       // The scalar stored to shared memory.
       MultiplyAddScalar,
       // One KxN tile per stage; K is divided by the instruction depth and N multiplied by it.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
       // The same thread arrangement as the global load.
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsB>;

   /// The traits class to build the iterator to load from shared memory for B^T.
   using SharedLoadTileTraits = GemmSharedLoadTileBTraits<
       // The const-qualified scalar read from shared memory.
       MultiplyAddScalar const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       GemmConfig_::kScalarsPerLdsB,
       // The skew: none for this layout.
       0>;
 };


 /// Bundles the types and helpers of a GEMM: the configuration, the global and shared load
 /// streams for A and B, the epilogue, the parameters, the shared-memory layout and the
 /// fences.
 template <
     /// The GEMM configuration.
     typename GemmConfig_,
     /// The stream to load A from global memory to shared memory.
     typename GlobalLoadStreamA_,
     /// The stream to load B from global memory to shared memory.
     typename GlobalLoadStreamB_,
     /// The stream to load A from shared memory.
     typename SharedLoadStreamA_,
     /// The stream to load B from shared memory.
     typename SharedLoadStreamB_,
     /// The epilogue.
     typename Epilogue_,
     /// The block swizzle to reorganize the grid.
     typename BlockSwizzle_ = IdentityBlockSwizzle,
     /// The index.
     typename Index_ = int,
     /// The tool used to clear accumulators.
     typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar> >
 struct GemmTraits {
   /// The configuration.
   using GemmConfig = GemmConfig_;
   /// The output tile.
   using OutputTile = typename GemmConfig::OutputTile;

   /// The stream to load A from global memory to shared memory.
   using GlobalLoadStreamA = GlobalLoadStreamA_;
   /// The layout of A.
   static MatrixLayout::Kind const kLayoutA = GlobalLoadStreamA::kLayout;
   /// The scalar for A.
   using ScalarA = typename GlobalLoadStreamA_::Scalar;

   /// The stream to load B from global memory to shared memory.
   using GlobalLoadStreamB = GlobalLoadStreamB_;
   /// The layout of B.
   static MatrixLayout::Kind const kLayoutB = GlobalLoadStreamB::kLayout;
   /// The scalar for B.
   using ScalarB = typename GlobalLoadStreamB_::Scalar;

   /// The iterator for A to load from shared memory.
   using SharedLoadStreamA = SharedLoadStreamA_;
   /// The iterator for B to load from shared memory.
   using SharedLoadStreamB = SharedLoadStreamB_;

   /// The shared storage for A.
   using SharedStoreStorageA = typename GlobalLoadStreamA::SharedStoreStorage;
   // The store side and the load side alias the same shared memory (see StreamSharedStorage),
   // so make sure both agree on the size of the storage.
   static_assert(sizeof(SharedStoreStorageA) == sizeof(typename SharedLoadStreamA::SharedStorage),
                 "GemmTraits: the shared store storage of A and the shared load storage of A "
                 "must have the same size");

   /// The shared storage for B.
   using SharedStoreStorageB = typename GlobalLoadStreamB::SharedStoreStorage;
   // Same size check as for A.
   static_assert(sizeof(SharedStoreStorageB) == sizeof(typename SharedLoadStreamB::SharedStorage),
                 "GemmTraits: the shared store storage of B and the shared load storage of B "
                 "must have the same size");

   /// The multiply-add functor.
   using MultiplyAdd = typename GemmConfig::MultiplyAdd;
   /// The epilogue.
   using Epilogue = Epilogue_;
   /// The scalars in the epilogue.
   using ScalarC = typename Epilogue::ScalarC;
   using ScalarD = typename Epilogue::ScalarD;

   /// The block swizzle to reorganize the grid.
   using BlockSwizzle = BlockSwizzle_;
   /// The index.
   using Index = Index_;
   /// The tool used to clear accumulators.
   using ClearAccumulators = ClearAccumulators_;

   /// The params.
   struct Params {
     /// The dimensions of the GEMM.
     Index m, n, k;
     /// The params for the A stream from global memory.
     typename GlobalLoadStreamA::Params global_stream_a;
     /// The params for the B stream from global memory.
     typename GlobalLoadStreamB::Params global_stream_b;
     /// The params for the A stream from shared memory.
     typename SharedLoadStreamA::Params shared_stream_a;
     /// The params for the B stream from shared memory.
     typename SharedLoadStreamB::Params shared_stream_b;
     /// The params for the epilogue.
     typename Epilogue::Params epilogue;

     /// Initializes the params from a GEMM descriptor.
     ///
     /// Reads m, n, k, d_a, lda, d_b and ldb from desc and forwards desc to the epilogue.
     /// shared_stream_a and shared_stream_b are not modified by this function.
     ///
     /// Returns the first non-zero error code reported by the A stream, the B stream or the
     /// epilogue (in that order); a zero value means every step succeeded.
     template <typename GemmDesc_>
     CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc) {
       // Set the problem size.
       this->m = desc.m;
       this->n = desc.n;
       this->k = desc.k;

       // Initialize the iterator for A.
       int error_code =
           global_stream_a.initialize(reinterpret_cast<ScalarA const*>(desc.d_a), desc.lda);

       if (error_code) {
         return error_code;
       }

       // Initialize the iterator for B.
       error_code = global_stream_b.initialize(reinterpret_cast<ScalarB const*>(desc.d_b), desc.ldb);

       if (error_code) {
         return error_code;
       }

       // The epilogue.
       return epilogue.initialize(desc);
     }
   };

   /// The storage of one operand stream: the global stream and the shared stream alias the
   /// same memory (this is a union).
   template <typename GlobalLoadStream_, typename SharedLoadStream_>
   union StreamSharedStorage {
     // The storage needed by the global stream.
     typename GlobalLoadStream_::SharedStorage global;
     // The storage needed by the shared stream.
     typename SharedLoadStream_::SharedStorage shared;
   };

   /// The storage for the main loop + prologue.
   struct MainLoopSharedStorage {
     // The storage to shuffle the A matrix in shared memory.
     StreamSharedStorage<GlobalLoadStreamA, SharedLoadStreamA> stream_a;
     // The storage to shuffle the B matrix in shared memory.
     StreamSharedStorage<GlobalLoadStreamB, SharedLoadStreamB> stream_b;
     // The storage to clear the accumulators if needed.
     typename ClearAccumulators::SharedStorage clear;
   };

   /// The storage in shared memory: the main loop and the epilogue alias the same memory
   /// (this is a union).
   union SharedStorage {
     // The storage for the main loop.
     MainLoopSharedStorage main_loop;
     // The storage for the epilogue.
     typename Epilogue::SharedStorage epilogue;
   };

   /// Assemble the global load streams for A/B.
   struct GlobalLoadStream {
     /// Builds both streams from the params, the shared storage and the block coordinates.
     /// block.x is passed as the last coordinate of the A stream and block.y as the last
     /// coordinate of the B stream.
     CUTLASS_DEVICE GlobalLoadStream(Params const& params,
                                     SharedStorage& shared_storage,
                                     dim3 const& block)
         : stream_a(params.global_stream_a,
                    shared_storage.main_loop.stream_a.global,
                    cutlass::make_Coord(0, params.k, params.m),
                    cutlass::make_Coord(0, 0, block.x)),
           stream_b(params.global_stream_b,
                    shared_storage.main_loop.stream_b.global,
                    cutlass::make_Coord(0, params.k, params.n),
                    cutlass::make_Coord(0, 0, block.y)) {}

     /// Trigger the copies of both streams.
     CUTLASS_DEVICE void copy() {
       stream_a.copy();
       stream_b.copy();
     }

     /// Commit the data of both streams.
     CUTLASS_DEVICE void commit() {
       stream_a.commit();
       stream_b.commit();
     }

     /// Forward the residue (k, skip_clear) to both streams.
     CUTLASS_DEVICE void residue(Index k, bool skip_clear = false) {
       stream_a.residue(k, skip_clear);
       stream_b.residue(k, skip_clear);
     }

     /// The stream for A.
     GlobalLoadStreamA stream_a;
     /// The stream for B.
     GlobalLoadStreamB stream_b;
   };

   /// Assemble the shared load stream for A/B.
   struct SharedLoadStream {
     /// Initializes both streams from their params and their part of the shared storage.
     CUTLASS_DEVICE SharedLoadStream(Params const& params, SharedStorage& shared_storage) {
       stream_a.initialize(params.shared_stream_a, shared_storage.main_loop.stream_a.shared);
       stream_b.initialize(params.shared_stream_b, shared_storage.main_loop.stream_b.shared);
     }

     /// Trigger the copies from shared memory to registers. The fragments are double-buffered:
     /// the buffer used is selected by step % 2.
     CUTLASS_DEVICE void copy(int step) {
       stream_a.copy(step, fetched_a[step % 2]);
       stream_b.copy(step, fetched_b[step % 2]);
     }

     /// Commit the data: turn the fetched fragments of buffer step % 2 into transformed ones.
     CUTLASS_DEVICE void commit(int step) {
       stream_a.commit(fetched_a[step % 2], transformed_a[step % 2]);
       stream_b.commit(fetched_b[step % 2], transformed_b[step % 2]);
     }

     /// The fragment A.
     CUTLASS_DEVICE typename SharedLoadStreamA::Fragment const& fragment_a(int step) const {
       return transformed_a[step % 2];
     }

     /// The fragment B.
     CUTLASS_DEVICE typename SharedLoadStreamB::Fragment const& fragment_b(int step) const {
       return transformed_b[step % 2];
     }

     /// Increment the stage.
     CUTLASS_DEVICE void inc_stage() {
       stream_a.inc_stage();
       stream_b.inc_stage();
     }

     /// The stream for A.
     SharedLoadStreamA stream_a;
     /// The fragments to fetch A.
     typename SharedLoadStreamA::FetchedFragment fetched_a[2];
     /// The fragments to transform A.
     typename SharedLoadStreamA::TransformedFragment transformed_a[2];
     /// The stream for B.
     SharedLoadStreamB stream_b;
     /// The fragments to fetch B.
     typename SharedLoadStreamB::FetchedFragment fetched_b[2];
     /// The fragments to transform B.
     typename SharedLoadStreamB::TransformedFragment transformed_b[2];
   };

   /// The memory fence for shared loads: synchronizes the threads only if one of the
   /// shared-load iterators requires it. The in_loop argument is currently unused.
   static CUTLASS_DEVICE void shared_load_fence(bool in_loop) {
     if (SharedLoadStreamA::Iterator::kRequiresLoadFence ||
         SharedLoadStreamB::Iterator::kRequiresLoadFence) {
       __syncthreads();
     }
   }

   /// The memory fence for shared stores: always synchronizes the threads. The in_loop
   /// argument is currently unused.
   static CUTLASS_DEVICE void shared_store_fence(bool in_loop) { __syncthreads(); }
 };


 /// Builds the iterators and streams for A and B from the two tile-traits helpers: a global
 /// load stream (global iterator + transformer + shared store iterator) and a shared load
 /// stream per operand.
 template <typename GemmTileTraitsHelperA_, typename GemmTileTraitsHelperB_, typename Index_>
 struct SimplifiedGemmTraitsHelper {
   /// The global iterator to load A from global memory.
   using GlobalLoadIteratorA =
       GemmGlobalIteratorAb<typename GemmTileTraitsHelperA_::GlobalTileTraits, Index_>;
   /// The data converter for A before storing to shared memory (a plain copy).
   using GlobalTransformerA = Copy<typename GlobalLoadIteratorA::Fragment>;
   /// The iterator to store A to shared memory.
   using SharedStoreIteratorA =
       TileStoreIterator<typename GemmTileTraitsHelperA_::SharedStoreTileTraits,
                         typename GemmTileTraitsHelperA_::SharedStoreTileTraits::Scalar,
                         IteratorAdvance::kH,
                         MemorySpace::kShared>;
   /// The stream to load A from global memory to shared memory.
   using GlobalLoadStreamA =
       GlobalLoadStream<GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA>;

   /// The global iterator to load B from global memory.
   using GlobalLoadIteratorB =
       GemmGlobalIteratorAb<typename GemmTileTraitsHelperB_::GlobalTileTraits, Index_>;
   /// The data converter for B before storing to shared memory (a plain copy).
   using GlobalTransformerB = Copy<typename GlobalLoadIteratorB::Fragment>;
   /// The iterator to store B to shared memory.
   using SharedStoreIteratorB =
       TileStoreIterator<typename GemmTileTraitsHelperB_::SharedStoreTileTraits,
                         typename GemmTileTraitsHelperB_::SharedStoreTileTraits::Scalar,
                         IteratorAdvance::kH,
                         MemorySpace::kShared>;
   /// The stream to load B from global memory to shared memory.
   using GlobalLoadStreamB =
       GlobalLoadStream<GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB>;

   /// The iterator to load A from shared memory.
   using SharedLoadIteratorA =
       TileLoadIterator<typename GemmTileTraitsHelperA_::SharedLoadTileTraits,
                        typename GemmTileTraitsHelperA_::Scalar,
                        IteratorAdvance::kH,
                        MemorySpace::kShared>;
   /// The stream to load A from shared memory.
   using SharedLoadStreamA = SharedLoadStream<SharedLoadIteratorA>;

   /// The iterator to load B from shared memory.
   using SharedLoadIteratorB =
       TileLoadIterator<typename GemmTileTraitsHelperB_::SharedLoadTileTraits,
                        typename GemmTileTraitsHelperB_::Scalar,
                        IteratorAdvance::kH,
                        MemorySpace::kShared>;
   /// The stream to load B from shared memory.
   using SharedLoadStreamB = SharedLoadStream<SharedLoadIteratorB>;
 };


 /// A GemmTraits whose streams are derived from the layouts of A and B, a GEMM configuration
 /// and an epilogue. The block swizzle is always IdentityBlockSwizzle.
 template <
     /// The layout of A.
     MatrixLayout::Kind kLayoutA_,
     /// The layout of B.
     MatrixLayout::Kind kLayoutB_,
     /// The GEMM configuration.
     typename GemmConfig_,
     /// The epilogue.
     typename Epilogue_,
     /// The index.
     typename Index_ = int,
     /// The tile traits for the A matrix.
     typename GemmTileTraitsHelperA_ = GemmTileTraitsHelperA<kLayoutA_, GemmConfig_>,
     /// The tile traits for the B matrix.
     typename GemmTileTraitsHelperB_ = GemmTileTraitsHelperB<kLayoutB_, GemmConfig_>,
     /// The helper that creates the streams and iterators.
     typename Helper_ =
         SimplifiedGemmTraitsHelper<GemmTileTraitsHelperA_, GemmTileTraitsHelperB_, Index_> >
 struct SimplifiedGemmTraits
     : public GemmTraits<
           // The GEMM configuration.
           GemmConfig_,
           // Global memory -> shared memory stream for A.
           typename Helper_::GlobalLoadStreamA,
           // Global memory -> shared memory stream for B.
           typename Helper_::GlobalLoadStreamB,
           // Shared memory load stream for A.
           typename Helper_::SharedLoadStreamA,
           // Shared memory load stream for B.
           typename Helper_::SharedLoadStreamB,
           // The epilogue.
           Epilogue_,
           // The block swizzle to reorganize the grid.
           IdentityBlockSwizzle,
           // The index.
           Index_,
           // The tool used to clear accumulators.
           // NOTE(review): the default argument of GemmTraits uses Accumulators::Scalar while
           // this uses Accumulators::Element -- confirm both name the same type.
           ClearAccumulators<typename GemmConfig_::Accumulators::Element> > {
 };


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::GemmTraits::Params::n
Index n
Definition: gemm_traits.h:483

cutlass::gemm::GemmConfig::kWarpSize
static int const kWarpSize
The default warp size (32 threads per warp).
Definition: gemm_traits.h:104

cutlass::gemm::GemmTraits::SharedStorage::epilogue
Epilogue::SharedStorage epilogue
Definition: gemm_traits.h:547

cutlass::gemm::GemmConfig::kScalarsPerStsA
static int const kScalarsPerStsA
Definition: gemm_traits.h:110

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^N.
Definition: gemm_traits.h:340

cutlass::gemm::GemmConfig::ScalarA
ScalarA_ ScalarA
The scalar for A.
Definition: gemm_traits.h:82

cutlass::gemm::GemmTraits::GlobalLoadStreamA
GlobalLoadStreamA_ GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: gemm_traits.h:435

cutlass::gemm::GemmEpilogue::ScalarD
GlobalStoreIteratorD::Scalar ScalarD
The scalar for D.
Definition: gemm_epilogue.h:98

cutlass::gemm::GemmConfig::MultiplyAdd
MultiplyAdd_ MultiplyAdd
The functor to do D = A*B + C.
Definition: gemm_traits.h:93

cutlass::gemm::GemmConfig::kAccumulatorsPerLdsA
static int const kAccumulatorsPerLdsA
The number of accumulators that are going to be fed from one LDS A/B.
Definition: gemm_traits.h:127

cutlass::MemorySpace::kShared
Definition: load_store.h:42

cutlass::gemm::GemmConfig::kScalarsPerLdsA
static int const kScalarsPerLdsA
Definition: gemm_traits.h:111

cutlass::gemm::GemmTraits::SharedLoadStreamA
SharedLoadStreamA_ SharedLoadStreamA
The iterator for A to load from shared memory.
Definition: gemm_traits.h:449

cutlass::gemm::GemmConfig::InstructionShape
MultiplyAdd::InstructionShape InstructionShape
The shape of the instruction.
Definition: gemm_traits.h:95

cutlass
Definition: convert.h:33

cutlass::gemm::GemmTraits::Params::shared_stream_a
SharedLoadStreamA::Params shared_stream_a
The params for the A stream from shared memory.
Definition: gemm_traits.h:489

cutlass::gemm::GemmSharedLoadTileATraits
Definition: gemm_shared_tile.h:129

cutlass::gemm::GemmTraits::GlobalLoadStreamB
GlobalLoadStreamB_ GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: gemm_traits.h:442

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits
Definition: gemm_shared_tile.h:80

cutlass::gemm::GemmTraits::SharedLoadStream::inc_stage
CUTLASS_DEVICE void inc_stage()
Increment the stage.
Definition: gemm_traits.h:620

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedStoreIteratorA
TileStoreIterator< typename GemmTileTraitsHelperA_::SharedStoreTileTraits, typename GemmTileTraitsHelperA_::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: gemm_traits.h:665

cutlass::gemm::GemmConfig::kScalarsPerLdsB
static int const kScalarsPerLdsB
Definition: gemm_traits.h:116

tile_iterator.h
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.

cutlass::gemm::GemmTraits::ScalarD
Epilogue::ScalarD ScalarD
Definition: gemm_traits.h:471

cutlass::gemm::GemmTraits::SharedStorage
The storage in shared memory.
Definition: gemm_traits.h:543

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadStreamB
SharedLoadStream< SharedLoadIteratorB > SharedLoadStreamB
The stream to load B from shared memory.
Definition: gemm_traits.h:700

cutlass::gemm::GemmTraits::Params::k
Index k
Definition: gemm_traits.h:483

cutlass::gemm::GemmTraits::StreamSharedStorage
Definition: gemm_traits.h:525

cutlass::gemm::GemmGlobalTileTraits
Definition: gemm_global_tile.h:70

cutlass::gemm::GemmTraits::SharedLoadStream::fetched_a
SharedLoadStreamA::FetchedFragment fetched_a[2]
The fragments to fetch A.
Definition: gemm_traits.h:628

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:241

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarB Scalar
The input scalar.
Definition: gemm_traits.h:283

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::SharedStoreTileTraits
GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^T.
Definition: gemm_traits.h:383

cutlass::gemm::GemmTraits::SharedLoadStreamB
SharedLoadStreamB_ SharedLoadStreamB
The iterator for B to load from shared memory.
Definition: gemm_traits.h:451

cutlass::gemm::GemmConfig::kScalarsPerStgD
static int const kScalarsPerStgD
The number of scalars per STS/LDS/STG for D.
Definition: gemm_traits.h:122

cutlass::gemm::GemmTraits::SharedLoadStream::copy
CUTLASS_DEVICE void copy(int step)
Trigger the copies from shared memory to registers.
Definition: gemm_traits.h:598

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: gemm_traits.h:301

cutlass::Copy
Definition: convert.h:69

cutlass::Fragment
A template defining Fragment Concept.
Definition: fragment.h:99

cutlass::gemm::GemmTraits::SharedLoadStream::stream_a
SharedLoadStreamA stream_a
The stream for A.
Definition: gemm_traits.h:626

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadStreamA
SharedLoadStream< SharedLoadIteratorA > SharedLoadStreamA
The stream to load A from shared memory.
Definition: gemm_traits.h:692

cutlass::gemm::GemmSharedStoreTileAbTraits
Definition: gemm_shared_tile.h:38

cutlass::gemm::GemmConfig::ScalarC
ScalarC_ ScalarC
The scalar for C.
Definition: gemm_traits.h:86

cutlass::gemm::GemmTraits::GlobalLoadStream::copy
CUTLASS_DEVICE void copy()
Trigger the copies from global memory.
Definition: gemm_traits.h:566

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, 0 > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^N.
Definition: gemm_traits.h:199

cutlass::gemm::GemmTraits::Epilogue
Epilogue_ Epilogue
The epilogue.
Definition: gemm_traits.h:468

cutlass::gemm::GemmTraits::ScalarA
GlobalLoadStreamA_::Scalar ScalarA
The scalar for A.
Definition: gemm_traits.h:439

cutlass::IteratorAdvance::kH
Definition: tile_iterator.h:62

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^N.
Definition: gemm_traits.h:165

cutlass::gemm::GemmConfig::Warps
ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
The number of warps.
Definition: gemm_traits.h:102

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarA Scalar
The input scalar.
Definition: gemm_traits.h:147

cutlass::gemm::GemmSharedLoadTileBTraits
Definition: gemm_shared_tile.h:198

cutlass::gemm::GemmTraits::SharedStoreStorageB
GlobalLoadStreamB::SharedStoreStorage SharedStoreStorageB
The shared storage for B.
Definition: gemm_traits.h:457

cutlass::gemm::GemmGlobalIteratorAb
Definition: gemm_global_tile.h:159

cutlass::gemm::GemmTraits::ScalarC
Epilogue::ScalarC ScalarC
The scalars in the epilogue.
Definition: gemm_traits.h:470

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadStreamB
GlobalLoadStream< GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: gemm_traits.h:683

cutlass::gemm::GemmTraits::SharedLoadStream::stream_b
SharedLoadStreamB stream_b
The stream for B.
Definition: gemm_traits.h:632

cutlass::gemm::GemmTraits::SharedLoadStream
Assemble the shared load stream for A/B.
Definition: gemm_traits.h:590

cutlass::gemm::GemmTraits::GlobalLoadStream::stream_b
GlobalLoadStreamB stream_b
The stream for B.
Definition: gemm_traits.h:586

cutlass::gemm::GemmTraits::MultiplyAdd
GemmConfig::MultiplyAdd MultiplyAdd
The multiply-add functor.
Definition: gemm_traits.h:463

cutlass::gemm::GemmTraits::shared_load_fence
static CUTLASS_DEVICE void shared_load_fence(bool in_loop)
The memory fence for shared loads.
Definition: gemm_traits.h:640

cutlass::gemm::GemmTraits::GemmConfig
GemmConfig_ GemmConfig
The configuration.
Definition: gemm_traits.h:430

cutlass::gemm::GlobalLoadStream
Definition: gemm_global_stream.h:161

cutlass::gemm::GemmTraits::SharedLoadStream::transformed_b
SharedLoadStreamB::TransformedFragment transformed_b[2]
The fragments to transform B.
Definition: gemm_traits.h:636

cutlass::gemm::GemmTileTraitsHelperB
Definition: gemm_traits.h:273

cutlass::gemm::GemmTraits::GlobalLoadStream::stream_a
GlobalLoadStreamA stream_a
The stream for A.
Definition: gemm_traits.h:584

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^T.
Definition: gemm_traits.h:267

cutlass::gemm::ClearAccumulators
Definition: clear_accumulators.h:38

cutlass::gemm::GemmTraits::MainLoopSharedStorage::stream_b
StreamSharedStorage< GlobalLoadStreamB, SharedLoadStreamB > stream_b
Definition: gemm_traits.h:537

cutlass::gemm::GemmTraits::Params
The params.
Definition: gemm_traits.h:481

cutlass::gemm::GemmConfig::kScalarsPerLdgA
static int const kScalarsPerLdgA
The number of scalars per LDG/STS/LDS for A.
Definition: gemm_traits.h:109

cutlass::gemm::GemmTraits::SharedLoadStream::fragment_b
CUTLASS_DEVICE SharedLoadStreamB::Fragment const  & fragment_b(int step) const
The fragment B.
Definition: gemm_traits.h:615

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalTransformerB
Copy< typename GlobalLoadIteratorB::Fragment > GlobalTransformerB
The data converter for B before storing to shared memory.
Definition: gemm_traits.h:674

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarB Scalar
The input scalar.
Definition: gemm_traits.h:351

cutlass::MatrixLayout
Describes layouts of matrices.
Definition: matrix_traits.h:35

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadIteratorB
GemmGlobalIteratorAb< typename GemmTileTraitsHelperB_::GlobalTileTraits, Index_ > GlobalLoadIteratorB
The global iterator to load B from global memory.
Definition: gemm_traits.h:672

cutlass::TileLoadIterator
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:302

cutlass::gemm::GemmTraits::MainLoopSharedStorage
Definition: gemm_traits.h:533

cutlass::MatrixLayout::kRowMajor
Definition: matrix_traits.h:36

cutlass::gemm::GemmTraits::GlobalLoadStream::residue
CUTLASS_DEVICE void residue(Index k, bool skip_clear=false)
Execute the residue code.
Definition: gemm_traits.h:578

cutlass::gemm::GemmConfig::Accumulators
MultiplyAdd::Accumulators Accumulators
The accumulators.
Definition: gemm_traits.h:99

cutlass::gemm::GemmTraits::ClearAccumulators
ClearAccumulators_ ClearAccumulators
Clear the accumulators.
Definition: gemm_traits.h:478

cutlass::gemm::SharedLoadStream
Definition: gemm_shared_stream.h:44

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: gemm_traits.h:228

reshape_tile.h
Defines a type for restructuring a tile.

gemm_operand.h
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...

cutlass::ShapeDiv::Shape
Shape< A_::kD/B_::kD, A_::kH/B_::kH, A_::kW/B_::kW, A_::kC/B_::kC > Shape
Definition: shape.h:126

cutlass::gemm::GemmConfig::kScalarsPerStsB
static int const kScalarsPerStsB
Definition: gemm_traits.h:115

clear_accumulators.h
Defines abstractions for efficiently clearing accumulator tiles.

cutlass::gemm::GemmConfig
Definition: gemm_traits.h:79

cutlass::gemm::GemmTraits::GlobalLoadStream
Assemble the global load streams for A/B.
Definition: gemm_traits.h:551

cutlass::gemm::GemmConfig::kScalarsPerStsD
static int const kScalarsPerStsD
Definition: gemm_traits.h:123

cutlass::gemm::GemmTraits::shared_store_fence
static CUTLASS_DEVICE void shared_store_fence(bool in_loop)
The memory fence for shared stores.
Definition: gemm_traits.h:648

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::Scalar
GemmConfig_::ScalarA Scalar
The input scalar.
Definition: gemm_traits.h:210

cutlass::gemm::GemmTileTraitsHelperA
Definition: gemm_traits.h:137

cutlass::gemm::GemmTraits::Params::initialize
CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const &desc)
Initialize the parameters.
Definition: gemm_traits.h:497

cutlass::gemm::GemmTraits::StreamSharedStorage::global
GlobalLoadStream_::SharedStorage global
Definition: gemm_traits.h:527

cutlass::GemmOperand::kB
Definition: matrix_traits.h:43

cutlass::gemm::IdentityBlockSwizzle
Definition: identity_block_swizzle.h:37

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedStoreTileTraits
GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: gemm_traits.h:179

cutlass::gemm::GemmConfig::ScalarB
ScalarB_ ScalarB
The scalar for B.
Definition: gemm_traits.h:84

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:353

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:285

cutlass::gemm::GemmTraits::ScalarB
GlobalLoadStreamB_::Scalar ScalarB
The scalar for B.
Definition: gemm_traits.h:446

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46

cutlass::gemm::GemmTraits::SharedStoreStorageA
GlobalLoadStreamA::SharedStoreStorage SharedStoreStorageA
The shared storage for A.
Definition: gemm_traits.h:454

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadStreamA
GlobalLoadStream< GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: gemm_traits.h:668

static_assert
#define static_assert(__e, __m)
Definition: platform.h:145

cutlass::gemm::GemmTraits
Definition: gemm_traits.h:428

cutlass::gemm::GemmConfig::AccumulatorsPerWarp
MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
The number of accumulators per warp.
Definition: gemm_traits.h:97

cutlass::gemm::GemmTraits::SharedLoadStream::transformed_a
SharedLoadStreamA::TransformedFragment transformed_a[2]
The fragments to transform A.
Definition: gemm_traits.h:630

cutlass::gemm::GemmTraits::StreamSharedStorage::shared
SharedLoadStream_::SharedStorage shared
Definition: gemm_traits.h:529

cutlass::gemm::GemmTraits::Params::global_stream_b
GlobalLoadStreamB::Params global_stream_b
The params for the B stream.
Definition: gemm_traits.h:487

cutlass::gemm::GemmTraits::SharedLoadStream::fetched_b
SharedLoadStreamB::FetchedFragment fetched_b[2]
The fragments to fetch B.
Definition: gemm_traits.h:634

cutlass::Shape
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64

cutlass::gemm::GemmConfig::kScalarsPerLdgC
static int const kScalarsPerLdgC
The number of scalars per LDG for C.
Definition: gemm_traits.h:119

cutlass::gemm::GemmConfig::ScalarD
ScalarD_ ScalarD
The scalar for D.
Definition: gemm_traits.h:88

cutlass::gemm::GemmConfig::kThreads
static int const kThreads
The number of threads.
Definition: gemm_traits.h:106

identity_block_swizzle.h
Defines functors for mapping blockIdx to partitions of the GEMM computation.

cutlass::gemm::GemmTraits::Params::m
Index m
The dimensions of the GEMM.
Definition: gemm_traits.h:483

cutlass::gemm::GemmTraits::BlockSwizzle
BlockSwizzle_ BlockSwizzle
The block swizzle to reorganize the grid.
Definition: gemm_traits.h:474

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadIteratorA
TileLoadIterator< typename GemmTileTraitsHelperA_::SharedLoadTileTraits, typename GemmTileTraitsHelperA_::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: gemm_traits.h:690

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:36

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedLoadIteratorB
TileLoadIterator< typename GemmTileTraitsHelperB_::SharedLoadTileTraits, typename GemmTileTraitsHelperB_::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: gemm_traits.h:698

cutlass::gemm::GemmTraits::SharedLoadStream::SharedLoadStream
CUTLASS_DEVICE SharedLoadStream(Params const &params, SharedStorage &shared_storage)
Ctor.
Definition: gemm_traits.h:592

cutlass::gemm::GemmTraits::GlobalLoadStream::GlobalLoadStream
CUTLASS_DEVICE GlobalLoadStream(Params const &params, SharedStorage &shared_storage, dim3 const &block)
Ctor.
Definition: gemm_traits.h:553

cutlass::gemm::GemmEpilogue::ScalarC
GlobalLoadIteratorC::Scalar ScalarC
The scalar for C.
Definition: gemm_epilogue.h:96

cutlass::gemm::GemmTraits::Index
Index_ Index
The index.
Definition: gemm_traits.h:476

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:149

cutlass::gemm::SimplifiedGemmTraitsHelper::SharedStoreIteratorB
TileStoreIterator< typename GemmTileTraitsHelperB_::SharedStoreTileTraits, typename GemmTileTraitsHelperB_::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: gemm_traits.h:680

cutlass::gemm::GemmTraits::Params::epilogue
Epilogue::Params epilogue
The params for the epilogue.
Definition: gemm_traits.h:493

cutlass::MatrixLayout::Kind
Kind
Definition: matrix_traits.h:36

cutlass::gemm::GemmTraits::Params::global_stream_a
GlobalLoadStreamA::Params global_stream_a
The params for the A stream.
Definition: gemm_traits.h:485

cutlass::gemm::ClearAccumulators::SharedStorage
The shared storage.
Definition: clear_accumulators.h:40

cutlass::gemm::GemmTraits::SharedLoadStream::commit
CUTLASS_DEVICE void commit(int step)
Commit the data.
Definition: gemm_traits.h:604

cutlass::gemm::GemmConfig::kScalarsPerLdsD
static int const kScalarsPerLdsD
Definition: gemm_traits.h:124

gemm_global_stream.h
Implements efficient loading of the thread block-level tile from global memory and storing to shared ...

cutlass::gemm::GemmTraits::SharedStorage::main_loop
MainLoopSharedStorage main_loop
Definition: gemm_traits.h:545

cutlass::gemm::GemmTraits::kLayoutA
static MatrixLayout::Kind const kLayoutA
The layout of A.
Definition: gemm_traits.h:437

cutlass::gemm::GemmConfig::OutputTile
OutputTile_ OutputTile
The tile.
Definition: gemm_traits.h:91

cutlass::gemm::GemmConfig::kScalarsPerLdgB
static int const kScalarsPerLdgB
The number of scalars per LDG/STS/LDS for B.
Definition: gemm_traits.h:114

cutlass::GemmOperand::kA
Definition: matrix_traits.h:43

cutlass::gemm::SimplifiedGemmTraitsHelper
Definition: gemm_traits.h:654

cutlass::gemm::GemmGlobalTileTraits::Threads
ReshapeThreads< Tile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:87

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalLoadIteratorA
GemmGlobalIteratorAb< typename GemmTileTraitsHelperA_::GlobalTileTraits, Index_ > GlobalLoadIteratorA
The global iterator to load A from global memory.
Definition: gemm_traits.h:657

cutlass::gemm::GemmTraits::OutputTile
GemmConfig::OutputTile OutputTile
The output tile.
Definition: gemm_traits.h:432

matrix_traits.h
Defines properties of matrices used to denote layout and operands to GEMM kernels.

cutlass::gemm::SimplifiedGemmTraitsHelper::GlobalTransformerA
Copy< typename GlobalLoadIteratorA::Fragment > GlobalTransformerA
The data converter for A before storing to shared memory.
Definition: gemm_traits.h:659

cutlass::gemm::GemmTraits::GlobalLoadStream::commit
CUTLASS_DEVICE void commit()
Commit the data.
Definition: gemm_traits.h:572

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, 0 > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^T.
Definition: gemm_traits.h:403

cutlass::gemm::GemmTraits::MainLoopSharedStorage::clear
ClearAccumulators::SharedStorage clear
Definition: gemm_traits.h:539

cutlass::gemm::GemmTraits::MainLoopSharedStorage::stream_a
StreamSharedStorage< GlobalLoadStreamA, SharedLoadStreamA > stream_a
Definition: gemm_traits.h:535

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::GlobalTileTraits
GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^T.
Definition: gemm_traits.h:369

gemm_shared_stream.h
Defines abstractions for managing loading and storing fragments to shared memory in the efficient GEM...

cutlass::ShapeCount
Computes derived counts for a Layout Concept-based class.
Definition: shape.h:79

convert.h
Defines conversion operations among Fragments of different base type.

cutlass::gemm::GemmTraits::Params::shared_stream_b
SharedLoadStreamB::Params shared_stream_b
The params for the B stream from shared memory.
Definition: gemm_traits.h:491

cutlass::gemm::SimplifiedGemmTraits
Definition: gemm_traits.h:723

cutlass::gemm::GemmTraits::SharedLoadStream::fragment_a
CUTLASS_DEVICE SharedLoadStreamA::Fragment const  & fragment_a(int step) const
The fragment A.
Definition: gemm_traits.h:610

cutlass::gemm::GemmTraits::kLayoutB
static MatrixLayout::Kind const kLayoutB
The layout of B.
Definition: gemm_traits.h:444

cutlass::gemm::GemmConfig::kAccumulatorsPerLdsB
static int const kAccumulatorsPerLdsB
Definition: gemm_traits.h:128

cutlass::gemm::GemmConfig::kStages
static int const kStages
The number of stages in shared memory used to implement double, triple, or deeper buffering.
Definition: gemm_traits.h:131

cutlass::TileStoreIterator
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:620

cutlass::gemm::ThreadMultiplyAdd::AccumulatorsPerWarp
ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: thread_multiply_add.h:51

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::MultiplyAddScalar
GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:212