74df0331f2a839e20abb2786c82b90487e8bef6a/docs/wmma__gemm__traits_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/wmma_matrix.h"
 #ifdef CUTLASS_USE_WMMA_API

 #include "cutlass/convert.h"
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/gemm/gemm_epilogue.h"
 #include "cutlass/gemm/gemm_epilogue_traits.h"
 #include "cutlass/gemm/gemm_global_tile.h"
 #include "cutlass/gemm/gemm_shared_tile.h"
 #include "cutlass/gemm/gemm_traits.h"
 #include "cutlass/gemm/wmma_gemm_epilogue_traits.h"
 #include "cutlass/gemm/wmma_gemm_global_tile.h"
 #include "cutlass/gemm/wmma_gemm_multiply_add.h"

 namespace cutlass {
 namespace gemm {


 // Configuration of a GEMM built on WMMA: GemmConfig instantiated with WmmaGemmMultiplyAdd as
 // the multiply-add type and a fixed set of constants. The struct adds no members of its own.
 //
 // NOTE(review): GemmConfig is declared in cutlass/gemm/gemm_traits.h, which is not visible
 // here. The per-argument notes marked "presumably" are inferred from the parameter names and
 // from the members the tile-traits helpers in this file read from their GemmConfig_
 // (kScalarsPerLdgA, kScalarsPerStsA, kStages, ...); confirm them against that header.
 template <
     // The layout of A (forwarded to WmmaGemmMultiplyAdd).
     MatrixLayout::Kind kLayoutA_,
     // The layout of B (forwarded to WmmaGemmMultiplyAdd).
     MatrixLayout::Kind kLayoutB_,
     // The output tile.
     typename OutputTile_,
     // The scalar type of A.
     typename ScalarA_,
     // The scalar type of B.
     typename ScalarB_,
     // The scalar type of C.
     typename ScalarC_,
     // The accumulator type.
     typename Accumulator_,
     // The shape of the per-warp GEMM (forwarded to WmmaGemmMultiplyAdd).
     typename WarpGemmShape_,
     // The shape of the WMMA instruction (forwarded to WmmaGemmMultiplyAdd).
     typename InstructionShape_,
     // The number of scalars per LDG for A.
     int kScalarsPerLdgA_,
     // The number of scalars per LDG for B.
     int kScalarsPerLdgB_>
 struct WmmaGemmConfig : public GemmConfig<
                             ScalarA_,
                             ScalarB_,
                             // ScalarC_ is passed twice: presumably once for C and once for D.
                             ScalarC_,
                             ScalarC_,
                             OutputTile_,
                             // The multiply-add type. A fixed MatrixLayout::kColumnMajor sits
                             // between the B and accumulator arguments; presumably the layout
                             // of the accumulators -- confirm in wmma_gemm_multiply_add.h.
                             WmmaGemmMultiplyAdd<kLayoutA_,
                                                 ScalarA_,
                                                 kLayoutB_,
                                                 ScalarB_,
                                                 MatrixLayout::kColumnMajor,
                                                 Accumulator_,
                                                 WarpGemmShape_,
                                                 InstructionShape_>,
                             // kScalarsPerLdgA_ is passed twice (presumably LDG and STS widths
                             // for A), followed by a literal 8 (presumably the LDS width).
                             kScalarsPerLdgA_,
                             kScalarsPerLdgA_,
                             8,
                             // Same pattern for B.
                             kScalarsPerLdgB_,
                             kScalarsPerLdgB_,
                             8,
                             // The number of ScalarC_ elements in 16 bytes.
                             16 / sizeof(ScalarC_),
                             // The number of Accumulator_ elements in 16 bytes (passed twice).
                             16 / sizeof(Accumulator_),
                             16 / sizeof(Accumulator_),
                             // Presumably the number of stages (read back as kStages) -- confirm.
                             1,
                             // Three boolean switches whose meaning is defined by GemmConfig;
                             // TODO confirm against gemm_traits.h before changing any of them.
                             false,
                             true,
                             false> {};


 template <enum MatrixLayout::Kind kLayout_,
           typename GemmConfig_,
           typename ScalarA_>
 struct WmmaGemmTileTraitsHelperA {};


 // Tile traits for operand A when A is column-major.
 //
 // GlobalTileTraits and MultiplyAddScalar come from the base class
 // GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_>; this specialization adds the
 // skewed tile, the WmmaMatrix type and the shared-memory store/load tile traits.
 // ScalarA_ is not referenced in the body; it only takes part in selecting a specialization.
 template <typename GemmConfig_, typename ScalarA_>
 struct WmmaGemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_, ScalarA_>
     : public GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_> {
   // The base class.
   typedef GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_> Base;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(typename Base::MultiplyAddScalar);
   // The tile: kStages x OutputTile::kD x (OutputTile::kW + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kD,
                 GemmConfig_::OutputTile::kW + kSkew>
       Tile;

   // The WmmaMatrix type for operand A. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kA,
                      MatrixLayout::kColumnMajor,
                      typename Base::MultiplyAddScalar,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to store A to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       typename Base::MultiplyAddScalar,
       // The tile has size KxM in GEMM's terminology.
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename Base::GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsA>
       SharedStoreTileTraits;

   // The W extent covered by Warps::kW instruction tiles placed side by side.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW;
   // Tile::kW scalars for each of the InstructionShape::kD steps of one instruction.
   static int const kScalarsPerIteration = Tile::kW * GemmConfig_::InstructionShape::kD;
   // The traits class to load A from the tile.
   typedef WmmaGemmSharedLoadTileATraits<
       // The layout of the matrix.
       MatrixLayout::kColumnMajor,
       // The pointer.
       typename Base::MultiplyAddScalar,
       // The output tile size.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps.
       GemmConfig_::InstructionShape::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>,
       // The stride between iterations.
       // NOTE(review): four-argument Shape here, while the row-major specialization passes
       // three arguments for the same parameter -- confirm the difference is intended.
       Shape<kScalarsPerIteration, 0, kScalarsPerW, 0>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };


 // Tile traits for operand A when A is row-major.
 //
 // Unlike the column-major specialization this one has no base class: it declares its own
 // Scalar, MultiplyAddScalar and GlobalTileTraits, and its tiles use OutputTile::kW as the
 // middle extent and OutputTile::kD as the last one.
 // ScalarA_ is not referenced in the body; it only takes part in selecting a specialization.
 template <typename GemmConfig_, typename ScalarA_>
 struct WmmaGemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_, ScalarA_> {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   // The scalar type of A as declared by the configuration.
   typedef typename GemmConfig_::ScalarA Scalar;
   // The scalar type of A as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar;

   // The WmmaMatrix type for operand A. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kA,
                      MatrixLayout::kRowMajor,
                      MultiplyAddScalar,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads A from global memory.
   typedef GemmGlobalTileTraits<
       // That's A.
       GemmOperand::kA,
       // A is row-major.
       MatrixLayout::kRowMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarA, not necessarily float).
       Scalar const,
       // The tile is 1 x OutputTile::kW x OutputTile::kD.
       // NOTE(review): the original comment said "KxM", which looks copied from the
       // column-major case; with kW first this is presumably MxK -- confirm.
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD>,
       // The kThreads threads are arranged as (kThreads / OutputTile::kD) x OutputTile::kD
       // (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgA>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory: kStages x OutputTile::kW x (OutputTile::kD + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kW,
                 GemmConfig_::OutputTile::kD + kSkew>
       Tile;

   // The traits class to store A to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsA>
       SharedStoreTileTraits;

   // The W extent covered by Warps::kW instruction tiles placed side by side.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW;
   // The traits class to load A from the tile.
   typedef WmmaGemmSharedLoadTileATraits<
       // The layout of the matrix.
       MatrixLayout::kRowMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kW times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kW * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };


 #ifdef CUTLASS_USE_SUBBYTE_WMMA
 // Tile traits for row-major A when ScalarA_ is Vector<bin1_t, 32>. Only compiled when
 // CUTLASS_USE_SUBBYTE_WMMA is defined.
 //
 // Same structure as the generic row-major specialization, except that every
 // OutputTile::kD / InstructionShape::kD extent and the LDG/STS widths are divided by
 // kBitsPerScalar, and WmmaMatrix is built on Vector<bin1_t, 32> instead of MultiplyAddScalar.
 // Presumably the configuration counts those quantities in 1-bit elements -- confirm.
 template <typename GemmConfig_>
 struct WmmaGemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_, Vector<bin1_t, 32> > {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   // The scalar type of A as declared by the configuration.
   typedef typename GemmConfig_::ScalarA Scalar;
   // The scalar type of A as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar;

   // The number of bits in one Scalar (sizeof in bytes times 8).
   static int const kBitsPerScalar = sizeof(Scalar) * 8;

   // The WmmaMatrix type for operand A. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kA,
                      MatrixLayout::kRowMajor,
                      Vector<bin1_t, 32>,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads A from global memory.
   typedef GemmGlobalTileTraits<
       // That's A.
       GemmOperand::kA,
       // A is row-major.
       MatrixLayout::kRowMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarA, not float).
       Scalar const,
       // The tile is 1 x OutputTile::kW x (OutputTile::kD / kBitsPerScalar).
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD / kBitsPerScalar>,
       // The kThreads threads are arranged so that the last extent matches the tile's last
       // extent (the traits may reorganize).
       Shape<1,
             GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kBitsPerScalar),
             GemmConfig_::OutputTile::kD / kBitsPerScalar>,
       // The number of scalars per LDG, divided by kBitsPerScalar.
       GemmConfig_::kScalarsPerLdgA / kBitsPerScalar>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory:
   // kStages x OutputTile::kW x (OutputTile::kD / kBitsPerScalar + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kW,
                 GemmConfig_::OutputTile::kD / kBitsPerScalar + kSkew>
       Tile;

   // The traits class to store A to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS, divided by kBitsPerScalar.
       GemmConfig_::kScalarsPerStsA / kBitsPerScalar>
       SharedStoreTileTraits;

   // The W extent covered by Warps::kW instruction tiles placed side by side.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW;
   // The traits class to load A from the tile.
   typedef WmmaGemmSharedLoadTileATraits<
       // The layout of the matrix.
       MatrixLayout::kRowMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kW times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kW * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD / kBitsPerScalar, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };
 #endif


 #ifdef CUTLASS_USE_SUBBYTE_WMMA
 // Tile traits for row-major A when ScalarA_ is Vector<uint4_t, 8>. Only compiled when
 // CUTLASS_USE_SUBBYTE_WMMA is defined.
 //
 // Same structure as the generic row-major specialization, except that every
 // OutputTile::kD / InstructionShape::kD extent and the LDG/STS widths are divided by
 // kInt4PerScalar, and WmmaMatrix is built on Vector<uint4_t, 8> instead of MultiplyAddScalar.
 // Presumably the configuration counts those quantities in 4-bit elements -- confirm.
 // The body matches the Vector<int4_t, 8> specialization apart from the WmmaMatrix element
 // type; keep the two in sync.
 template <typename GemmConfig_>
 struct WmmaGemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_, Vector<uint4_t, 8> > {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   // The scalar type of A as declared by the configuration.
   typedef typename GemmConfig_::ScalarA Scalar;
   // The scalar type of A as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar;

   // The number of 4-bit values in one Scalar (two per byte).
   static int const kInt4PerScalar = sizeof(Scalar) * 2;

   // The WmmaMatrix type for operand A. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kA,
                      MatrixLayout::kRowMajor,
                      Vector<uint4_t, 8>,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads A from global memory.
   typedef GemmGlobalTileTraits<
       // That's A.
       GemmOperand::kA,
       // A is row-major.
       MatrixLayout::kRowMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarA, not float).
       Scalar const,
       // The tile is 1 x OutputTile::kW x (OutputTile::kD / kInt4PerScalar).
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The kThreads threads are arranged so that the last extent matches the tile's last
       // extent (the traits may reorganize).
       Shape<1,
             GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar),
             GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The number of scalars per LDG, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerLdgA / kInt4PerScalar>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory:
   // kStages x OutputTile::kW x (OutputTile::kD / kInt4PerScalar + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kW,
                 GemmConfig_::OutputTile::kD / kInt4PerScalar + kSkew>
       Tile;

   // The traits class to store A to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerStsA / kInt4PerScalar>
       SharedStoreTileTraits;

   // The W extent covered by Warps::kW instruction tiles placed side by side.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW;
   // The traits class to load A from the tile.
   typedef WmmaGemmSharedLoadTileATraits<
       // The layout of the matrix.
       MatrixLayout::kRowMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kW times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kW * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD / kInt4PerScalar, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };
 #endif


 #ifdef CUTLASS_USE_SUBBYTE_WMMA
 // Tile traits for row-major A when ScalarA_ is Vector<int4_t, 8>. Only compiled when
 // CUTLASS_USE_SUBBYTE_WMMA is defined.
 //
 // Same structure as the generic row-major specialization, except that every
 // OutputTile::kD / InstructionShape::kD extent and the LDG/STS widths are divided by
 // kInt4PerScalar, and WmmaMatrix is built on Vector<int4_t, 8> instead of MultiplyAddScalar.
 // Presumably the configuration counts those quantities in 4-bit elements -- confirm.
 // The body matches the Vector<uint4_t, 8> specialization apart from the WmmaMatrix element
 // type; keep the two in sync.
 template <typename GemmConfig_>
 struct WmmaGemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_, Vector<int4_t, 8> > {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor;

   // The scalar type of A as declared by the configuration.
   typedef typename GemmConfig_::ScalarA Scalar;
   // The scalar type of A as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar;

   // The number of 4-bit values in one Scalar (two per byte).
   static int const kInt4PerScalar = sizeof(Scalar) * 2;

   // The WmmaMatrix type for operand A. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kA,
                      MatrixLayout::kRowMajor,
                      Vector<int4_t, 8>,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads A from global memory.
   typedef GemmGlobalTileTraits<
       // That's A.
       GemmOperand::kA,
       // A is row-major.
       MatrixLayout::kRowMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarA, not float).
       Scalar const,
       // The tile is 1 x OutputTile::kW x (OutputTile::kD / kInt4PerScalar).
       Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The kThreads threads are arranged so that the last extent matches the tile's last
       // extent (the traits may reorganize).
       Shape<1,
             GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar),
             GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The number of scalars per LDG, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerLdgA / kInt4PerScalar>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory:
   // kStages x OutputTile::kW x (OutputTile::kD / kInt4PerScalar + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kW,
                 GemmConfig_::OutputTile::kD / kInt4PerScalar + kSkew>
       Tile;

   // The traits class to store A to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerStsA / kInt4PerScalar>
       SharedStoreTileTraits;

   // The W extent covered by Warps::kW instruction tiles placed side by side.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW;
   // The traits class to load A from the tile.
   typedef WmmaGemmSharedLoadTileATraits<
       // The layout of the matrix.
       MatrixLayout::kRowMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kW times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kW * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD / kInt4PerScalar, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };
 #endif


 template <enum MatrixLayout::Kind kLayout_,
           typename GemmConfig_,
           typename ScalarB_>
 struct WmmaGemmTileTraitsHelperB {};


 // Tile traits for operand B when B is row-major.
 //
 // GlobalTileTraits and MultiplyAddScalar come from the base class
 // GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_>; this specialization adds the
 // skewed tile, the WmmaMatrix type and the shared-memory store/load tile traits.
 // ScalarB_ is not referenced in the body; it only takes part in selecting a specialization.
 template <typename GemmConfig_, typename ScalarB_>
 struct WmmaGemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_, ScalarB_>
     : public GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_> {
   // The base class.
   typedef GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_> Base;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(typename Base::MultiplyAddScalar);
   // The tile: kStages x OutputTile::kD x (OutputTile::kH + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kD,
                 GemmConfig_::OutputTile::kH + kSkew>
       Tile;

   // The WmmaMatrix type for operand B. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kB,
                      MatrixLayout::kRowMajor,
                      typename Base::MultiplyAddScalar,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to store B to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       typename Base::MultiplyAddScalar,
       // The tile defined above (skew included).
       // NOTE(review): the original comment said "KxM", copied from the A helper; this tile
       // is built from OutputTile::kH, so presumably KxN -- confirm.
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename Base::GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsB>
       SharedStoreTileTraits;

   // The H extent covered by Warps::kH instruction tiles placed side by side. The constant
   // keeps the name kScalarsPerW used by the A helper even though it is built from kH.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH;
   // Tile::kW scalars for each of the InstructionShape::kD steps of one instruction.
   static int const kScalarsPerIteration = Tile::kW * GemmConfig_::InstructionShape::kD;
   // The traits class to load B from the tile.
   typedef WmmaGemmSharedLoadTileBTraits<
       // The layout of the matrix.
       MatrixLayout::kRowMajor,
       // The pointer.
       typename Base::MultiplyAddScalar,
       // The output tile size.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps.
       GemmConfig_::InstructionShape::kH,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>,
       // The stride between iterations.
       // NOTE(review): four-argument Shape here, while the column-major specialization passes
       // three arguments for the same parameter -- confirm the difference is intended.
       Shape<kScalarsPerIteration, 0, kScalarsPerW, 0>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };


 // Tile traits for operand B when B is column-major.
 //
 // Unlike the row-major specialization this one has no base class: it declares its own
 // Scalar, MultiplyAddScalar and GlobalTileTraits, and its tiles use OutputTile::kH as the
 // middle extent and OutputTile::kD as the last one.
 // ScalarB_ is not referenced in the body; it only takes part in selecting a specialization.
 template <typename GemmConfig_, typename ScalarB_>
 struct WmmaGemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_, ScalarB_> {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   // The scalar type of B as declared by the configuration.
   typedef typename GemmConfig_::ScalarB Scalar;
   // The scalar type of B as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar;

   // The WmmaMatrix type for operand B. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kB,
                      MatrixLayout::kColumnMajor,
                      MultiplyAddScalar,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads B from global memory.
   typedef GemmGlobalTileTraits<
       // That's B.
       GemmOperand::kB,
       // B is column-major.
       MatrixLayout::kColumnMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarB, not necessarily float).
       Scalar const,
       // The tile is 1 x OutputTile::kH x OutputTile::kD.
       // NOTE(review): the original comment said "KxM", copied from the A helper; with kH
       // first this is presumably NxK -- confirm.
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD>,
       // The kThreads threads are arranged as (kThreads / OutputTile::kD) x OutputTile::kD
       // (the traits may reorganize).
       Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
       // The number of scalars per LDG (LDG.32 or LDG.128, etc).
       GemmConfig_::kScalarsPerLdgB>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory: kStages x OutputTile::kH x (OutputTile::kD + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kH,
                 GemmConfig_::OutputTile::kD + kSkew>
       Tile;

   // The traits class to store B to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS (STS.32 or STS.128, etc).
       GemmConfig_::kScalarsPerStsB>
       SharedStoreTileTraits;

   // The H extent covered by Warps::kH instruction tiles placed side by side. The constant
   // keeps the name kScalarsPerW used by the A helper even though it is built from kH.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH;
   // The traits class to load B from the tile.
   typedef WmmaGemmSharedLoadTileBTraits<
       // The layout of the matrix.
       MatrixLayout::kColumnMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kH times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kH * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };


 #ifdef CUTLASS_USE_SUBBYTE_WMMA
 // Tile traits for column-major B when ScalarB_ is Vector<bin1_t, 32>. Only compiled when
 // CUTLASS_USE_SUBBYTE_WMMA is defined.
 //
 // Same structure as the generic column-major specialization, except that every
 // OutputTile::kD / InstructionShape::kD extent and the LDG/STS widths are divided by
 // kBitsPerScalar, and WmmaMatrix is built on Vector<bin1_t, 32> instead of MultiplyAddScalar.
 // Presumably the configuration counts those quantities in 1-bit elements -- confirm.
 template <typename GemmConfig_>
 struct WmmaGemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_, Vector<bin1_t, 32> > {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   // The scalar type of B as declared by the configuration.
   typedef typename GemmConfig_::ScalarB Scalar;
   // The scalar type of B as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar;

   // The number of bits in one Scalar (sizeof in bytes times 8).
   static int const kBitsPerScalar = sizeof(Scalar) * 8;

   // The WmmaMatrix type for operand B. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kB,
                      MatrixLayout::kColumnMajor,
                      Vector<bin1_t, 32>,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads B from global memory.
   typedef GemmGlobalTileTraits<
       // That's B.
       GemmOperand::kB,
       // B is column-major.
       MatrixLayout::kColumnMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarB, not float).
       Scalar const,
       // The tile is 1 x OutputTile::kH x (OutputTile::kD / kBitsPerScalar).
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD / kBitsPerScalar>,
       // The kThreads threads are arranged so that the last extent matches the tile's last
       // extent (the traits may reorganize).
       Shape<1,
             GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kBitsPerScalar),
             GemmConfig_::OutputTile::kD / kBitsPerScalar>,
       // The number of scalars per LDG, divided by kBitsPerScalar.
       GemmConfig_::kScalarsPerLdgB / kBitsPerScalar>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory:
   // kStages x OutputTile::kH x (OutputTile::kD / kBitsPerScalar + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kH,
                 GemmConfig_::OutputTile::kD / kBitsPerScalar + kSkew>
       Tile;

   // The traits class to store B to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS, divided by kBitsPerScalar.
       GemmConfig_::kScalarsPerStsB / kBitsPerScalar>
       SharedStoreTileTraits;

   // The H extent covered by Warps::kH instruction tiles placed side by side. The constant
   // keeps the name kScalarsPerW used by the A helper even though it is built from kH.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH;
   // The traits class to load B from the tile.
   typedef WmmaGemmSharedLoadTileBTraits<
       // The layout of the matrix.
       MatrixLayout::kColumnMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kH times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kH * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD / kBitsPerScalar, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };
 #endif


 #ifdef CUTLASS_USE_SUBBYTE_WMMA
 // Tile traits for column-major B when ScalarB_ is Vector<uint4_t, 8>. Only compiled when
 // CUTLASS_USE_SUBBYTE_WMMA is defined.
 //
 // Same structure as the generic column-major specialization, except that every
 // OutputTile::kD / InstructionShape::kD extent and the LDG/STS widths are divided by
 // kInt4PerScalar, and WmmaMatrix is built on Vector<uint4_t, 8> instead of MultiplyAddScalar.
 // Presumably the configuration counts those quantities in 4-bit elements -- confirm.
 // The body matches the Vector<int4_t, 8> specialization apart from the WmmaMatrix element
 // type; keep the two in sync.
 template <typename GemmConfig_>
 struct WmmaGemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_, Vector<uint4_t, 8> > {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   // The scalar type of B as declared by the configuration.
   typedef typename GemmConfig_::ScalarB Scalar;
   // The scalar type of B as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar;

   // The number of 4-bit values in one Scalar (two per byte).
   static int const kInt4PerScalar = sizeof(Scalar) * 2;

   // The WmmaMatrix type for operand B. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kB,
                      MatrixLayout::kColumnMajor,
                      Vector<uint4_t, 8>,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads B from global memory.
   typedef GemmGlobalTileTraits<
       // That's B.
       GemmOperand::kB,
       // B is column-major.
       MatrixLayout::kColumnMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarB, not float).
       Scalar const,
       // The tile is 1 x OutputTile::kH x (OutputTile::kD / kInt4PerScalar).
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The kThreads threads are arranged so that the last extent matches the tile's last
       // extent (the traits may reorganize).
       Shape<1,
             GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar),
             GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The number of scalars per LDG, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerLdgB / kInt4PerScalar>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory:
   // kStages x OutputTile::kH x (OutputTile::kD / kInt4PerScalar + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kH,
                 GemmConfig_::OutputTile::kD / kInt4PerScalar + kSkew>
       Tile;

   // The traits class to store B to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerStsB / kInt4PerScalar>
       SharedStoreTileTraits;

   // The H extent covered by Warps::kH instruction tiles placed side by side. The constant
   // keeps the name kScalarsPerW used by the A helper even though it is built from kH.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH;
   // The traits class to load B from the tile.
   typedef WmmaGemmSharedLoadTileBTraits<
       // The layout of the matrix.
       MatrixLayout::kColumnMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kH times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kH * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD / kInt4PerScalar, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };
 #endif


 #ifdef CUTLASS_USE_SUBBYTE_WMMA
 // Tile traits for column-major B when ScalarB_ is Vector<int4_t, 8>. Only compiled when
 // CUTLASS_USE_SUBBYTE_WMMA is defined.
 //
 // Same structure as the generic column-major specialization, except that every
 // OutputTile::kD / InstructionShape::kD extent and the LDG/STS widths are divided by
 // kInt4PerScalar, and WmmaMatrix is built on Vector<int4_t, 8> instead of MultiplyAddScalar.
 // Presumably the configuration counts those quantities in 4-bit elements -- confirm.
 // The body matches the Vector<uint4_t, 8> specialization apart from the WmmaMatrix element
 // type; keep the two in sync.
 template <typename GemmConfig_>
 struct WmmaGemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_, Vector<int4_t, 8> > {
   // The layout.
   static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor;

   // The scalar type of B as declared by the configuration.
   typedef typename GemmConfig_::ScalarB Scalar;
   // The scalar type of B as seen by the multiply-add type.
   typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar;

   // The number of 4-bit values in one Scalar (two per byte).
   static int const kInt4PerScalar = sizeof(Scalar) * 2;

   // The WmmaMatrix type for operand B. Note that this member typedef has the same name as the
   // namespace-scope template it is built from.
   typedef WmmaMatrix<GemmOperand::kB,
                      MatrixLayout::kColumnMajor,
                      Vector<int4_t, 8>,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   // The traits class to build the iterator that loads B from global memory.
   typedef GemmGlobalTileTraits<
       // That's B.
       GemmOperand::kB,
       // B is column-major.
       MatrixLayout::kColumnMajor,
       // The scalar type is Scalar const (GemmConfig_::ScalarB, not float).
       Scalar const,
       // The tile is 1 x OutputTile::kH x (OutputTile::kD / kInt4PerScalar).
       Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The kThreads threads are arranged so that the last extent matches the tile's last
       // extent (the traits may reorganize).
       Shape<1,
             GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar),
             GemmConfig_::OutputTile::kD / kInt4PerScalar>,
       // The number of scalars per LDG, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerLdgB / kInt4PerScalar>
       GlobalTileTraits;

   // The skew: the number of MultiplyAddScalar elements in 16 bytes. It is added to the last
   // extent of Tile below.
   static int const kSkew = 16 / sizeof(MultiplyAddScalar);
   // The tile in shared memory:
   // kStages x OutputTile::kH x (OutputTile::kD / kInt4PerScalar + kSkew).
   typedef Shape<GemmConfig_::kStages,
                 GemmConfig_::OutputTile::kH,
                 GemmConfig_::OutputTile::kD / kInt4PerScalar + kSkew>
       Tile;

   // The traits class to store B to the tile.
   typedef GemmSharedStoreTileAbTraits<
       // The pointer.
       MultiplyAddScalar,
       // The tile defined above (skew included).
       Tile,
       // The threads are distributed as warps x 32 (the traits may reorganize).
       typename GlobalTileTraits::Threads,
       // The number of scalars per STS, divided by kInt4PerScalar.
       GemmConfig_::kScalarsPerStsB / kInt4PerScalar>
       SharedStoreTileTraits;

   // The H extent covered by Warps::kH instruction tiles placed side by side. The constant
   // keeps the name kScalarsPerW used by the A helper even though it is built from kH.
   static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH;
   // The traits class to load B from the tile.
   typedef WmmaGemmSharedLoadTileBTraits<
       // The layout of the matrix.
       MatrixLayout::kColumnMajor,
       // The pointer.
       MultiplyAddScalar,
       // The tile in shared memory.
       Tile,
       // The number of warps.
       typename GemmConfig_::Warps,
       // The strides between warps: InstructionShape::kH times the (skewed) Tile::kW.
       GemmConfig_::InstructionShape::kH * Tile::kW,
       // The number of iterations to load the data.
       Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>,
       // The stride between iterations.
       Shape<GemmConfig_::InstructionShape::kD / kInt4PerScalar, 0, kScalarsPerW * Tile::kW>,
       // The shape of the instruction.
       typename GemmConfig_::InstructionShape>
       SharedLoadTileTraits;
 };
 #endif


 /// Collects the component types of a WMMA-based GEMM, each one derived from
 /// the layouts, shapes and scalar types supplied as template arguments.
 ///
 /// The struct declares type aliases only; it has no data members and no
 /// functions. WmmaGemmTraits reads several of these aliases (GemmConfig, the
 /// global/shared load streams, Epilogue, ClearAccumulators) through its
 /// Helper_ parameter.
 template <
     /// Layout of A; forwarded to WmmaGemmConfig and WmmaGemmTileTraitsHelperA.
     MatrixLayout::Kind kLayoutA_,
     /// Layout of B; forwarded to WmmaGemmConfig and WmmaGemmTileTraitsHelperB.
     MatrixLayout::Kind kLayoutB_,
     /// Output tile shape; forwarded to WmmaGemmConfig.
     typename OutputTile_,
     /// Scalar type of A; forwarded to WmmaGemmConfig and the A tile helper.
     typename ScalarA_,
     /// Scalar type of B; forwarded to WmmaGemmConfig and the B tile helper.
     typename ScalarB_,
     /// Scalar type of C; forwarded to WmmaGemmConfig.
     typename ScalarC_,
     /// Accumulator type; forwarded to WmmaGemmConfig.
     typename Accumulator_,
     /// Functor handed to the epilogue traits (not part of GemmConfig).
     typename EpilogueFunctor_,
     /// Per-warp GEMM shape; forwarded to WmmaGemmConfig.
     typename WarpGemmShape_,
     /// Shape of one WMMA instruction; forwarded to WmmaGemmConfig.
     typename InstructionShape_,
     /// Scalars per global load of A; forwarded to WmmaGemmConfig.
     int kScalarsPerLdgA_,
     /// Scalars per global load of B; forwarded to WmmaGemmConfig.
     int kScalarsPerLdgB_,
     /// Index type given to the iterators and to the epilogue traits.
     typename Index_>
 struct WmmaGemmTraitsHelper {
   /// The GEMM configuration shared by every alias below.
   using GemmConfig = WmmaGemmConfig<kLayoutA_,
                                     kLayoutB_,
                                     OutputTile_,
                                     ScalarA_,
                                     ScalarB_,
                                     ScalarC_,
                                     Accumulator_,
                                     WarpGemmShape_,
                                     InstructionShape_,
                                     kScalarsPerLdgA_,
                                     kScalarsPerLdgB_>;

   /// Tile traits (global tile, shared store tile, shared load tile) for A.
   using GemmTileTraitsHelperA = WmmaGemmTileTraitsHelperA<kLayoutA_, GemmConfig, ScalarA_>;
   /// Tile traits (global tile, shared store tile, shared load tile) for B.
   using GemmTileTraitsHelperB = WmmaGemmTileTraitsHelperB<kLayoutB_, GemmConfig, ScalarB_>;

   // ---- A: global memory -> shared memory ----

   /// Iterator over the global tile of A.
   using GlobalLoadIteratorA =
       GemmGlobalIteratorAb<typename GemmTileTraitsHelperA::GlobalTileTraits, Index_>;
   /// Transformer applied to the fragment loaded for A (a plain Copy).
   using GlobalTransformerA = Copy<typename GlobalLoadIteratorA::Fragment>;
   /// Iterator that stores the A tile with MemorySpace::kShared.
   using SharedStoreIteratorA =
       TileStoreIterator<typename GemmTileTraitsHelperA::SharedStoreTileTraits,
                         typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar,
                         IteratorAdvance::kH,
                         MemorySpace::kShared>;
   /// Stream combining the three aliases above for operand A.
   using GlobalLoadStreamA = GlobalLoadStream<GemmOperand::kA,
                                              GlobalLoadIteratorA,
                                              SharedStoreIteratorA,
                                              GlobalTransformerA>;

   // ---- B: global memory -> shared memory ----

   /// Iterator over the global tile of B.
   using GlobalLoadIteratorB =
       GemmGlobalIteratorAb<typename GemmTileTraitsHelperB::GlobalTileTraits, Index_>;
   /// The default transformer for B (a plain Copy of the loaded fragment).
   using GlobalTransformerB = Copy<typename GlobalLoadIteratorB::Fragment>;
   /// Iterator that stores the B tile with MemorySpace::kShared.
   using SharedStoreIteratorB =
       TileStoreIterator<typename GemmTileTraitsHelperB::SharedStoreTileTraits,
                         typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar,
                         IteratorAdvance::kH,
                         MemorySpace::kShared>;
   /// Stream combining the three aliases above for operand B.
   using GlobalLoadStreamB = GlobalLoadStream<GemmOperand::kB,
                                              GlobalLoadIteratorB,
                                              SharedStoreIteratorB,
                                              GlobalTransformerB>;

   // ---- Shared memory -> WMMA matrix fragments ----

   /// Iterator that loads A with MemorySpace::kShared; its fragment elements
   /// are the helper's WmmaMatrix type (FragmentElementType::kWmmaMatrix).
   using SharedLoadIteratorA =
       TileLoadIterator<typename GemmTileTraitsHelperA::SharedLoadTileTraits,
                        typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar,
                        IteratorAdvance::kH,
                        MemorySpace::kShared,
                        Index_,
                        typename GemmTileTraitsHelperA::WmmaMatrix,
                        FragmentElementType::kWmmaMatrix>;
   /// Stream wrapping SharedLoadIteratorA.
   using SharedLoadStreamA = SharedLoadStream<SharedLoadIteratorA>;
   /// Iterator that loads B with MemorySpace::kShared; its fragment elements
   /// are the helper's WmmaMatrix type (FragmentElementType::kWmmaMatrix).
   using SharedLoadIteratorB =
       TileLoadIterator<typename GemmTileTraitsHelperB::SharedLoadTileTraits,
                        typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar,
                        IteratorAdvance::kH,
                        MemorySpace::kShared,
                        Index_,
                        typename GemmTileTraitsHelperB::WmmaMatrix,
                        FragmentElementType::kWmmaMatrix>;
   /// Stream wrapping SharedLoadIteratorB.
   using SharedLoadStreamB = SharedLoadStream<SharedLoadIteratorB>;

   // ---- Math and accumulators ----

   /// The multiply-add functor chosen by the configuration.
   using MultiplyAdd = typename GemmConfig::MultiplyAdd;
   /// The tool used to clear accumulators. The template named on the right is
   /// the one visible outside this struct; the member alias of the same name
   /// is only declared once the right-hand side has been parsed.
   using ClearAccumulators = ClearAccumulators<typename MultiplyAdd::ScalarC>;

   // ---- Epilogue ----

   /// Helper that derives the WMMA-specific epilogue pieces.
   using EpilogueTraitsHelper = WmmaGemmEpilogueTraitsHelper<GemmConfig, EpilogueFunctor_, Index_>;
   /// Epilogue traits built from the configuration, the functor and the helper.
   using GemmEpilogueTraits =
       SimplifiedGemmEpilogueTraits<GemmConfig, EpilogueFunctor_, Index_, EpilogueTraitsHelper>;
   /// The epilogue.
   using Epilogue = GemmEpilogue<GemmEpilogueTraits>;
 };


 /// Chooses the accumulator shape handled by one warp.
 ///
 /// The result is ShapeMin<OutputTile_, DefaultShape_>::Shape, so the output
 /// tile is bounded by DefaultShape_, which defaults to Shape<64, 32, 64>.
 template <typename OutputTile_, typename DefaultShape_ = Shape<64, 32, 64> >
 struct WmmaGemmAccumulatorsPerWarp {
   /// The resulting per-warp shape.
   using Shape = typename ShapeMin<OutputTile_, DefaultShape_>::Shape;
 };


 /// Traits of a WMMA-based GEMM.
 ///
 /// The struct has no members of its own: it derives from GemmTraits,
 /// instantiated with the component types published by Helper_. Only the two
 /// layouts are mandatory; every other parameter has a default.
 template <
     /// The layout of A.
     MatrixLayout::Kind kLayoutA_,
     /// The layout of B.
     MatrixLayout::Kind kLayoutB_,
     /// The output tile; defaults to Shape<64, 128, 128>.
     typename OutputTile_ = Shape<64, 128, 128>,
     /// The scalar type of A; defaults to half.
     typename ScalarA_ = half,
     /// The scalar type of B; defaults to half.
     typename ScalarB_ = half,
     /// The scalar type of C; defaults to float.
     typename ScalarC_ = float,
     /// The epilogue functor; defaults to LinearScaling over ScalarC_.
     typename EpilogueFunctor_ = LinearScaling<ScalarC_>,
     /// The accumulator type; defaults to ScalarC_.
     typename Accumulator_ = ScalarC_,
     /// The per-warp GEMM shape; by default derived from OutputTile_ through
     /// WmmaGemmAccumulatorsPerWarp.
     typename WarpGemmShape_ = typename WmmaGemmAccumulatorsPerWarp<OutputTile_>::Shape,
     /// The shape of one WMMA instruction; defaults to Shape<16, 16, 16>.
     typename InstructionShape_ = Shape<16, 16, 16>,
     /// The number of scalars per global load for A; defaults to 8.
     int kScalarsPerLdgA_ = 8,
     /// The number of scalars per global load for B; defaults to 8.
     int kScalarsPerLdgB_ = 8,
     /// The index type; defaults to int.
     typename Index_ = int,
     /// The helper that supplies the member types used in the base-class list
     /// below. The default is WmmaGemmTraitsHelper instantiated with all of
     /// the parameters above.
     typename Helper_ = WmmaGemmTraitsHelper<kLayoutA_,
                                             kLayoutB_,
                                             OutputTile_,
                                             ScalarA_,
                                             ScalarB_,
                                             ScalarC_,
                                             Accumulator_,
                                             EpilogueFunctor_,
                                             WarpGemmShape_,
                                             InstructionShape_,
                                             kScalarsPerLdgA_,
                                             kScalarsPerLdgB_,
                                             Index_> >
 struct WmmaGemmTraits : public GemmTraits<
                             // The config.
                             typename Helper_::GemmConfig,
                             // The stream to load A from global memory to shared memory.
                             typename Helper_::GlobalLoadStreamA,
                             // The stream to load B from global memory to shared memory.
                             typename Helper_::GlobalLoadStreamB,
                             // The stream to load A from shared memory.
                             typename Helper_::SharedLoadStreamA,
                             // The stream to load B from shared memory.
                             typename Helper_::SharedLoadStreamB,
                             // The epilogue.
                             typename Helper_::Epilogue,
                             // The block swizzle to reorganize the grid.
                             IdentityBlockSwizzle,
                             // The index.
                             Index_,
                             // The tool used to clear accumulators.
                             typename Helper_::ClearAccumulators> {};


 }  // namespace gemm
 }  // namespace cutlass

 #endif  // defined CUTLASS_USE_WMMA_API
wmma_matrix.h
Abstractions for loading and storing matrices using the CUDA WMMA API.

cutlass::MemorySpace::kShared
Definition: load_store.h:41

cutlass
Definition: convert.h:33

gemm_global_tile.h
Defines iterators for efficiently loading and storing to global memory.

gemm_traits.h
Defines structural properties of complete GEMM computation.

wmma_gemm_epilogue_traits.h
Defines structural properties of WMMA GEMM's epilogue phase.

cutlass::FragmentElementType::kWmmaMatrix
Definition: load_store.h:48

cutlass::IteratorAdvance::kH
Definition: tile_iterator.h:65

gemm_epilogue.h
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...

gemm_shared_tile.h
Defines iterators for efficiently loading and storing tiles to and from shared memory.

cutlass::gemm::GemmConfig::MultiplyAdd
MultiplyAdd_ MultiplyAdd
The functor to do D = A*B + C.
Definition: gemm_config.h:90

cutlass::MatrixLayout::kRowMajor
Definition: matrix_traits.h:159

cutlass::GemmOperand::kB
Definition: matrix_traits.h:357

wmma_gemm_global_tile.h
Defines tile iterator traits for loading thread block-level tile from global memory.

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:159

cutlass::MatrixLayout::Kind
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159

wmma_gemm_multiply_add.h
Implements warp-level matrix multiply-accumulate operation using CUDA WMMA API.

cutlass::GemmOperand::kA
Definition: matrix_traits.h:357

gemm.h
Implements a software-pipelined efficient GEMM.

gemm_epilogue_traits.h
Defines structural properties of the GEMM epilogue.

cutlass::ShapeMin::Shape
Shape<(A_::kD< B_::kD ? A_::kD :B_::kD),(A_::kH< B_::kH ? A_::kH :B_::kH),(A_::kW< B_::kW ? A_::kW :B_::kW),(A_::kC< B_::kC ? A_::kC :B_::kC)> Shape
Definition: shape.h:159

convert.h
Defines conversion operations among Fragments of different base type.