docs/generated-html/hgemm__traits_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <cutlass/convert.h>
 #include <cutlass/reshape_tile.h>

 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/gemm_epilogue.h>
 #include <cutlass/gemm/gemm_epilogue_traits.h>
 #include <cutlass/gemm/gemm_global_tile.h>
 #include <cutlass/gemm/gemm_shared_tile.h>
 #include <cutlass/gemm/gemm_traits.h>
 #include <cutlass/gemm/hgemm_global_tile.h>
 #include <cutlass/gemm/hgemm_multiply_add.h>
 #include <cutlass/gemm/hgemm_swizzle.h>

 namespace cutlass {
 namespace gemm {


 /// Configuration for the half-precision GEMM (HGEMM).
 ///
 /// Derives from GemmConfig with `half` supplied for all four scalar-type arguments and a
 /// ThreadMultiplyAdd functor instantiated on `half`. The remaining arguments are positional;
 /// their exact meaning is defined by GemmConfig (gemm_traits.h), which is not shown in this
 /// file.
 ///
 /// \tparam OutputTile_ Tile shape forwarded unchanged to GemmConfig.
 /// \tparam AccumulatorsPerThread_ Shape forwarded as the first argument of ThreadMultiplyAdd.
 /// \tparam kScalarsPerLdgA_ Number of scalars per LDG for A; defaults to 2.
 /// \tparam kScalarsPerLdgB_ Number of scalars per LDG for B; defaults to 2.
 template <
     typename OutputTile_,
     typename AccumulatorsPerThread_,
     int kScalarsPerLdgA_ = 2,
     int kScalarsPerLdgB_ = 2>
 struct HgemmConfig
     : public GemmConfig<
           // Four scalar-type arguments, all half (presumably A, B, C and D -- TODO confirm
           // against GemmConfig).
           half,
           half,
           half,
           half,
           // The output tile, forwarded as-is.
           OutputTile_,
           // The functor performing the multiply-add within a thread, on half operands.
           ThreadMultiplyAdd<AccumulatorsPerThread_, Shape<1, 4, 8>, half, half, half>,
           // A operand: the scalars-per-LDG value is passed twice, followed by 8 (presumably the
           // LDG, STS and LDS widths in scalars -- TODO confirm against GemmConfig).
           kScalarsPerLdgA_,
           kScalarsPerLdgA_,
           8,
           // B operand: same pattern as for A.
           kScalarsPerLdgB_,
           kScalarsPerLdgB_,
           8,
           // Remaining positional constants; their meaning is defined by GemmConfig and cannot be
           // determined from this file -- TODO confirm.
           2,
           8,
           2,
           2> {};


 /// Selects the transformer type used for A, depending on the layout of A.
 ///
 /// The primary template defines no members; only the specializations below provide a
 /// Transformer type.
 ///
 /// \tparam kLayout_ Layout of A.
 /// \tparam Iterator_ Iterator type; the specializations use Iterator_::Fragment or Iterator_
 ///         itself to build the transformer.
 template <enum MatrixLayout::Kind kLayout_, typename Iterator_>
 struct HgemmTransformerA {};

 /// Column-major A: Convert whose source and destination are both Iterator_::Fragment.
 template <typename Iterator_>
 struct HgemmTransformerA<MatrixLayout::kColumnMajor, Iterator_> {
   typedef Convert<typename Iterator_::Fragment, typename Iterator_::Fragment> Transformer;
 };

 /// Row-major A: HgemmSwizzle instantiated on the iterator.
 template <typename Iterator_>
 struct HgemmTransformerA<MatrixLayout::kRowMajor, Iterator_> {
   typedef HgemmSwizzle<Iterator_> Transformer;
 };


 /// Selects the transformer type used for B, depending on the layout of B.
 ///
 /// The primary template defines no members; only the specializations below provide a
 /// Transformer type. The layout-to-transformer mapping is the opposite of the one used by
 /// HgemmTransformerA.
 ///
 /// \tparam kLayout_ Layout of B.
 /// \tparam Iterator_ Iterator type; the specializations use Iterator_::Fragment or Iterator_
 ///         itself to build the transformer.
 template <enum MatrixLayout::Kind kLayout_, typename Iterator_>
 struct HgemmTransformerB {};

 /// Row-major B: Convert whose source and destination are both Iterator_::Fragment.
 template <typename Iterator_>
 struct HgemmTransformerB<MatrixLayout::kRowMajor, Iterator_> {
   typedef Convert<typename Iterator_::Fragment, typename Iterator_::Fragment> Transformer;
 };

 /// Column-major B: HgemmSwizzle instantiated on the iterator.
 template <typename Iterator_>
 struct HgemmTransformerB<MatrixLayout::kColumnMajor, Iterator_> {
   typedef HgemmSwizzle<Iterator_> Transformer;
 };


 /// Tile traits helper for A in HGEMM.
 ///
 /// Generic case: adds nothing and inherits everything from GemmTileTraitsHelperA for the same
 /// layout and config. The row-major layout has a dedicated specialization below.
 template <enum MatrixLayout::Kind kLayout_, typename GemmConfig_>
 struct HgemmTileTraitsHelperA : public GemmTileTraitsHelperA<kLayout_, GemmConfig_> {};


 /// Tile traits helper for row-major A in HGEMM.
 ///
 /// Inherits from the generic GemmTileTraitsHelperA for row-major A and redefines the three tile
 /// traits typedefs (global load, shared store, shared load) below.
 ///
 /// \tparam GemmConfig_ The GEMM configuration; this specialization reads its OutputTile,
 ///         InstructionShape, Warps, MultiplyAdd, kThreads, kStages and kScalarsPerLdgA members.
 template <typename GemmConfig_>
 struct HgemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_>
     : public GemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_> {
   /// The base config.
   typedef GemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_> Base;

   /// The traits class to build the iterator to load data from global memory for A^T.
   typedef HgemmCrosswiseGlobalTileTraits<
       GemmOperand::kA,
       // The layout.
       MatrixLayout::kRowMajor,
       // The pointer.
       half const,
       // The tile has size MxK in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD>,
       // The threads are distributed as (threads / K ) x K (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc)
       GemmConfig_::kScalarsPerLdgA>
       GlobalTileTraits;

   /// The traits class to build the iterator to store data to shared memory for A^T.
   typedef GemmSharedStoreWithSkewTileAbTraits<
       // The pointer.
       half,
       // The tile has size KxM in GEMM's terminology.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize); this
       // reuses the thread shape of the global tile traits.
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       2,
       // The skew to avoid bank conflicts added in the tile W dimension.
       128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2>
       SharedStoreTileTraits;

   /// The traits class to build the iterator to load from shared memory for A^T.
   typedef GemmSharedLoadTileATraits<
       // The pointer.
       half const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       8,
       // The skew: must match the skew used when storing to shared memory.
       SharedStoreTileTraits::kSkew>
       SharedLoadTileTraits;
 };


 /// Tile traits helper for B in HGEMM.
 ///
 /// Generic case: adds nothing and inherits everything from GemmTileTraitsHelperB for the same
 /// layout and config. The column-major layout has a dedicated specialization below.
 template <enum MatrixLayout::Kind kLayout_, typename GemmConfig_>
 struct HgemmTileTraitsHelperB : public GemmTileTraitsHelperB<kLayout_, GemmConfig_> {};


 /// Tile traits helper for column-major B in HGEMM.
 ///
 /// Inherits from the generic GemmTileTraitsHelperB for column-major B and redefines the three
 /// tile traits typedefs (global load, shared store, shared load) below.
 ///
 /// \tparam GemmConfig_ The GEMM configuration; this specialization reads its OutputTile,
 ///         InstructionShape, Warps, MultiplyAdd, kThreads, kStages and kScalarsPerLdgB members.
 template <typename GemmConfig_>
 struct HgemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_>
     : public GemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_> {
   /// The base config.
   typedef GemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_> Base;

   /// The traits class to build the iterator to load data from global memory for B^N.
   typedef HgemmCrosswiseGlobalTileTraits<
       GemmOperand::kB,
       // The layout.
       MatrixLayout::kColumnMajor,
       // The pointer.
       half const,
       // The tile has size KxN in GEMM's terminology.
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc)
       GemmConfig_::kScalarsPerLdgB>
       GlobalTileTraits;

   /// The traits class to build the iterator to store data to shared memory for B^N.
   typedef GemmSharedStoreWithSkewTileAbTraits<
       // The pointer.
       half,
       // The tile has size KxN in GEMM's terminology.
       Shape<GemmConfig_::kStages,
             GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
             GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
       // The threads are distributed as (threads / K) x K (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       2,
       // The skew to avoid bank conflicts added in the tile W dimension.
       128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2>
       SharedStoreTileTraits;

   /// The traits class to build the iterator to load from shared memory for B^N.
   typedef GemmSharedLoadTileBTraits<
       // The pointer.
       half const,
       // The output tile size.
       typename GemmConfig_::OutputTile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The number of threads per warp.
       typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
       // The shape of the FMA instruction.
       typename GemmConfig_::InstructionShape,
       // The number of stages.
       GemmConfig_::kStages,
       // The number of scalars per LDS.
       8,
       // The skew: must match the skew used when storing to shared memory.
       SharedStoreTileTraits::kSkew>
       SharedLoadTileTraits;
 };


 /// Helper that assembles all the component types of an HGEMM (config, tile traits, iterators,
 /// streams, multiply-add functor and epilogue) from a small set of parameters.
 ///
 /// \tparam kLayoutA_ Layout of A; selects the HgemmTileTraitsHelperA specialization.
 /// \tparam kLayoutB_ Layout of B; selects the HgemmTileTraitsHelperB specialization.
 /// \tparam OutputTile_ Tile shape forwarded to HgemmConfig.
 /// \tparam EpilogueFunctor_ Functor type forwarded to SimplifiedGemmEpilogueTraits.
 /// \tparam AccumulatorsPerThread_ Shape forwarded to HgemmConfig.
 /// \tparam kScalarsPerLdgA_ Number of scalars per LDG for A, forwarded to HgemmConfig.
 /// \tparam kScalarsPerLdgB_ Number of scalars per LDG for B, forwarded to HgemmConfig.
 /// \tparam Index_ Index type passed to the global iterators and to the epilogue traits.
 template <
     MatrixLayout::Kind kLayoutA_,
     MatrixLayout::Kind kLayoutB_,
     typename OutputTile_,
     typename EpilogueFunctor_,
     // NOTE(review): this default differs from the default used by HgemmTraits
     // (Shape<8, 8, 16>). HgemmTraits always passes the argument explicitly, so this default
     // only applies when the helper is instantiated directly -- confirm it is intended.
     typename AccumulatorsPerThread_ = Shape<32, 8, 8>,
     int kScalarsPerLdgA_ = 2,
     int kScalarsPerLdgB_ = 2,
     typename Index_ = int>
 struct HgemmTraitsHelper {
   /// The HGEMM config.
   typedef HgemmConfig<OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_>
       GemmConfig;
   /// The GEMM config for A.
   typedef HgemmTileTraitsHelperA<kLayoutA_, GemmConfig> GemmTileTraitsHelperA;
   /// The GEMM config for B.
   typedef HgemmTileTraitsHelperB<kLayoutB_, GemmConfig> GemmTileTraitsHelperB;

   /// The iterator to load A from global memory.
   typedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperA::GlobalTileTraits, Index_>
       GlobalLoadIteratorA;
   /// The default transformer for A.
   typedef typename HgemmTransformerA<GemmTileTraitsHelperA::kLayout,
                                      GlobalLoadIteratorA>::Transformer GlobalTransformerA;
   /// The iterator to store A to shared memory.
   typedef TileStoreIterator<typename GemmTileTraitsHelperA::SharedStoreTileTraits,
                             typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar,
                             IteratorAdvance::kH,
                             MemorySpace::kShared>
       SharedStoreIteratorA;
   /// The stream to load A from global memory to shared memory.
   typedef GlobalLoadStream<GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA>
       GlobalLoadStreamA;

   /// The iterator to load B from global memory.
   typedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperB::GlobalTileTraits, Index_>
       GlobalLoadIteratorB;
   /// The default transformer for B.
   typedef typename HgemmTransformerB<GemmTileTraitsHelperB::kLayout,
                                      GlobalLoadIteratorB>::Transformer GlobalTransformerB;
   /// The iterator to store B to shared memory.
   typedef TileStoreIterator<typename GemmTileTraitsHelperB::SharedStoreTileTraits,
                             typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar,
                             IteratorAdvance::kH,
                             MemorySpace::kShared>
       SharedStoreIteratorB;
   /// The stream to load B from global memory to shared memory.
   typedef GlobalLoadStream<GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB>
       GlobalLoadStreamB;

   /// The iterator to load A from shared memory.
   typedef TileLoadIterator<typename GemmTileTraitsHelperA::SharedLoadTileTraits,
                            typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar,
                            IteratorAdvance::kH,
                            MemorySpace::kShared>
       SharedLoadIteratorA;
   /// The stream to load A from shared memory.
   typedef SharedLoadStream<SharedLoadIteratorA> SharedLoadStreamA;
   /// The iterator to load B from shared memory.
   typedef TileLoadIterator<typename GemmTileTraitsHelperB::SharedLoadTileTraits,
                            typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar,
                            IteratorAdvance::kH,
                            MemorySpace::kShared>
       SharedLoadIteratorB;
   /// The stream to load B from shared memory.
   typedef SharedLoadStream<SharedLoadIteratorB> SharedLoadStreamB;

   /// The functor to do the multiply-add in the main loop.
   typedef typename GemmConfig::MultiplyAdd MultiplyAdd;
   /// The object to clear accumulators. Note that this member typedef has the same name as the
   /// template it instantiates.
   typedef ClearAccumulators<typename MultiplyAdd::ScalarC> ClearAccumulators;

   /// The traits class for the epilogue.
   typedef SimplifiedGemmEpilogueTraits<GemmConfig, EpilogueFunctor_, Index_> GemmEpilogueTraits;
   /// The epilogue.
   typedef GemmEpilogue<GemmEpilogueTraits> Epilogue;
 };


 /// Traits of the half-precision GEMM.
 ///
 /// Derives from GemmTraits, taking every component type from Helper_ (by default an
 /// HgemmTraitsHelper built from the other template arguments) and using IdentityBlockSwizzle
 /// as the block swizzle. The struct adds no members of its own.
 ///
 /// \tparam kLayoutA_ Layout of A.
 /// \tparam kLayoutB_ Layout of B.
 /// \tparam OutputTile_ Output tile shape; defaults to Shape<8, 128, 128>.
 /// \tparam EpilogueFunctor_ Functor used by the epilogue; defaults to LinearScaling<half>.
 /// \tparam AccumulatorsPerThread_ Accumulator shape forwarded to the helper; defaults to
 ///         Shape<8, 8, 16>.
 /// \tparam kScalarsPerLdgA_ Number of scalars per LDG for A; defaults to 2.
 /// \tparam kScalarsPerLdgB_ Number of scalars per LDG for B; defaults to 2.
 /// \tparam Index_ Index type; defaults to int.
 /// \tparam Helper_ Type providing GemmConfig, GlobalLoadStreamA/B, SharedLoadStreamA/B,
 ///         Epilogue and ClearAccumulators. Note that when a custom Helper_ is supplied, the
 ///         preceding arguments other than Index_ are not used by this struct directly.
 template <
     MatrixLayout::Kind kLayoutA_,
     MatrixLayout::Kind kLayoutB_,
     typename OutputTile_ = Shape<8, 128, 128>,
     typename EpilogueFunctor_ = LinearScaling<half>,
     typename AccumulatorsPerThread_ = Shape<8, 8, 16>,
     int kScalarsPerLdgA_ = 2,
     int kScalarsPerLdgB_ = 2,
     typename Index_ = int,
     typename Helper_ = HgemmTraitsHelper<kLayoutA_,
                                          kLayoutB_,
                                          OutputTile_,
                                          EpilogueFunctor_,
                                          AccumulatorsPerThread_,
                                          kScalarsPerLdgA_,
                                          kScalarsPerLdgB_,
                                          Index_> >
 struct HgemmTraits : public GemmTraits<
                          // The config.
                          typename Helper_::GemmConfig,
                          // The stream to load A from global memory to shared memory.
                          typename Helper_::GlobalLoadStreamA,
                          // The stream to load B from global memory to shared memory.
                          typename Helper_::GlobalLoadStreamB,
                          // The stream to load A from shared memory.
                          typename Helper_::SharedLoadStreamA,
                          // The stream to load B from shared memory.
                          typename Helper_::SharedLoadStreamB,
                          // The epilogue.
                          typename Helper_::Epilogue,
                          // The block swizzle to reorganize the grid.
                          IdentityBlockSwizzle,
                          // The index.
                          Index_,
                          // The tool used to clear accumulators.
                          typename Helper_::ClearAccumulators> {};


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::HgemmTraitsHelper::GlobalLoadIteratorA
GemmGlobalIteratorAb< typename GemmTileTraitsHelperA::GlobalTileTraits, Index_ > GlobalLoadIteratorA
The iterator to load A from global memory.
Definition: hgemm_traits.h:282

cutlass::MemorySpace::kShared
Definition: load_store.h:42

cutlass::gemm::HgemmTransformerB< MatrixLayout::kColumnMajor, Iterator_ >::Transformer
HgemmSwizzle< Iterator_ > Transformer
Definition: hgemm_traits.h:119

cutlass
Definition: convert.h:33

cutlass::gemm::GemmSharedLoadTileATraits
Definition: gemm_shared_tile.h:129

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits
Definition: gemm_shared_tile.h:80

cutlass::gemm::GemmEpilogue
Definition: gemm_epilogue.h:53

gemm_global_tile.h
Defines iterators for efficiently loading and storing to global memory.

cutlass::gemm::HgemmTraitsHelper::GlobalLoadIteratorB
GemmGlobalIteratorAb< typename GemmTileTraitsHelperB::GlobalTileTraits, Index_ > GlobalLoadIteratorB
The iterator to load B from global memory.
Definition: hgemm_traits.h:298

cutlass::gemm::HgemmTraitsHelper::ClearAccumulators
ClearAccumulators< typename MultiplyAdd::ScalarC > ClearAccumulators
The object to clear accumulators.
Definition: hgemm_traits.h:332

gemm_traits.h
Defines structural properties of complete GEMM computation.

cutlass::gemm::HgemmTraitsHelper::SharedStoreIteratorA
TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: hgemm_traits.h:291

cutlass::gemm::HgemmTraitsHelper::GlobalLoadStreamA
GlobalLoadStream< GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: hgemm_traits.h:294

cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::GlobalTileTraits
HgemmCrosswiseGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, half const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: hgemm_traits.h:213

cutlass::gemm::HgemmTransformerA
Definition: hgemm_traits.h:95

cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::Base
GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > Base
The base config.
Definition: hgemm_traits.h:198

cutlass::gemm::HgemmTraitsHelper::SharedLoadStreamA
SharedLoadStream< SharedLoadIteratorA > SharedLoadStreamA
The stream to load A from shared memory.
Definition: hgemm_traits.h:319

cutlass::gemm::HgemmTransformerA< MatrixLayout::kColumnMajor, Iterator_ >::Transformer
Convert< typename Iterator_::Fragment, typename Iterator_::Fragment > Transformer
Definition: hgemm_traits.h:99

cutlass::gemm::HgemmTraits
Definition: hgemm_traits.h:368

cutlass::gemm::HgemmTransformerA< MatrixLayout::kRowMajor, Iterator_ >::Transformer
HgemmSwizzle< Iterator_ > Transformer
Definition: hgemm_traits.h:104

cutlass::IteratorAdvance::kH
Definition: tile_iterator.h:62

cutlass::gemm::GemmSharedLoadTileBTraits
Definition: gemm_shared_tile.h:198

cutlass::gemm::HgemmTraitsHelper::SharedLoadIteratorB
TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: hgemm_traits.h:325

cutlass::gemm::GemmGlobalIteratorAb
Definition: gemm_global_tile.h:159

cutlass::gemm::HgemmTraitsHelper::Epilogue
GemmEpilogue< GemmEpilogueTraits > Epilogue
The epilogue.
Definition: hgemm_traits.h:337

cutlass::gemm::HgemmTraitsHelper::GlobalTransformerA
HgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
The default transformer for A.
Definition: hgemm_traits.h:285

gemm_epilogue.h
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...

cutlass::gemm::GlobalLoadStream
Definition: gemm_global_stream.h:161

cutlass::gemm::GemmTileTraitsHelperB
Definition: gemm_traits.h:273

cutlass::gemm::HgemmTileTraitsHelperA
Definition: hgemm_traits.h:125

cutlass::MatrixLayout
Describes layouts of matrices.
Definition: matrix_traits.h:35

cutlass::gemm::HgemmTraitsHelper::SharedLoadStreamB
SharedLoadStream< SharedLoadIteratorB > SharedLoadStreamB
The stream to load B from shared memory.
Definition: hgemm_traits.h:327

cutlass::gemm::HgemmTransformerB
Definition: hgemm_traits.h:110

cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::Base
GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > Base
The base config.
Definition: hgemm_traits.h:133

cutlass::gemm::HgemmTraitsHelper::SharedLoadIteratorA
TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: hgemm_traits.h:317

cutlass::TileLoadIterator
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:302

cutlass::gemm::HgemmTraitsHelper::GemmEpilogueTraits
SimplifiedGemmEpilogueTraits< GemmConfig, EpilogueFunctor_, Index_ > GemmEpilogueTraits
The traits class for the epilogue.
Definition: hgemm_traits.h:335

gemm_shared_tile.h
Defines iterators for efficiently loading and storing tiles to and from shared memory.

cutlass::MatrixLayout::kRowMajor
Definition: matrix_traits.h:36

cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >
Definition: gemm_traits.h:205

cutlass::gemm::SharedLoadStream
Definition: gemm_shared_stream.h:44

reshape_tile.h
Defines a type for restructuring a tile.

hgemm_multiply_add.h
Specialization implementing multiply-add operation on half-precision floating point fragments...

cutlass::gemm::GemmConfig
Definition: gemm_traits.h:79

hgemm_swizzle.h
Transposes a tile of 16b elements. Used by HGEMM to construct a K-strided layout in shared memory for...

cutlass::gemm::GemmTileTraitsHelperA
Definition: gemm_traits.h:137

cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileBTraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^N.
Definition: hgemm_traits.h:249

cutlass::GemmOperand::kB
Definition: matrix_traits.h:43

cutlass::gemm::HgemmTraitsHelper::GemmConfig
HgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ > GemmConfig
The HGEMM config.
Definition: hgemm_traits.h:274

cutlass::gemm::HgemmTileTraitsHelperB
Definition: hgemm_traits.h:190

cutlass::gemm::HgemmTraitsHelper::GlobalLoadStreamB
GlobalLoadStream< GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: hgemm_traits.h:310

cutlass::gemm::HgemmTraitsHelper::MultiplyAdd
GemmConfig::MultiplyAdd MultiplyAdd
The functor to do the multiply-add in the main loop.
Definition: hgemm_traits.h:330

cutlass::gemm::HgemmTraitsHelper::GemmTileTraitsHelperB
HgemmTileTraitsHelperB< kLayoutB_, GemmConfig > GemmTileTraitsHelperB
The GEMM config for B.
Definition: hgemm_traits.h:278

cutlass::gemm::GemmTraits
Definition: gemm_traits.h:428

cutlass::gemm::HgemmCrosswiseGlobalTileTraits
Definition: hgemm_global_tile.h:48

cutlass::Shape
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64

cutlass::gemm::SimplifiedGemmEpilogueTraits
Definition: gemm_epilogue_traits.h:300

cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::SharedLoadTileTraits
GemmSharedLoadTileATraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^T.
Definition: hgemm_traits.h:184

cutlass::gemm::HgemmTraitsHelper::GemmTileTraitsHelperA
HgemmTileTraitsHelperA< kLayoutA_, GemmConfig > GemmTileTraitsHelperA
The GEMM config for A.
Definition: hgemm_traits.h:276

cutlass::gemm::ThreadMultiplyAdd
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:43

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:36

cutlass::MatrixLayout::Kind
Kind
Definition: matrix_traits.h:36

cutlass::gemm::HgemmTraitsHelper::GlobalTransformerB
HgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
Definition: hgemm_traits.h:301

cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >
Definition: gemm_traits.h:278

cutlass::gemm::HgemmTraitsHelper
Definition: hgemm_traits.h:271

cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::GlobalTileTraits
HgemmCrosswiseGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, half const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: hgemm_traits.h:148

hgemm_global_tile.h
Tile traits used to construct global tile iterator for HGEMM. This is intended to partition the threa...

cutlass::gemm::LinearScaling
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:40

cutlass::Convert
Definition: convert.h:38

cutlass::GemmOperand::kA
Definition: matrix_traits.h:43

gemm.h
Implements a software-pipelined efficient GEMM.

cutlass::gemm::GemmGlobalTileTraits::Threads
ReshapeThreads< Tile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:87

gemm_epilogue_traits.h
Defines structural properties of the GEMM epilogue.

cutlass::gemm::HgemmSwizzle
Definition: hgemm_swizzle.h:40

convert.h
Defines conversion operations among Fragments of different base type.

cutlass::gemm::HgemmTransformerB< MatrixLayout::kRowMajor, Iterator_ >::Transformer
Convert< typename Iterator_::Fragment, typename Iterator_::Fragment > Transformer
Definition: hgemm_traits.h:114

cutlass::gemm::HgemmConfig
Definition: hgemm_traits.h:57

cutlass::TileStoreIterator
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:620

cutlass::gemm::HgemmTraitsHelper::SharedStoreIteratorB
TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: hgemm_traits.h:307