374882be53f2a7558aeb6c4955b8b9da75b29ecf/docs/wmma__gemm__epilogue__traits_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <cutlass/wmma_matrix.h>
 #ifdef CUTLASS_USE_WMMA_API

 #include <cutlass/convert.h>
 #include <cutlass/coord.h>
 #include <cutlass/gemm/gemm_global_stream.h>
 #include <cutlass/gemm/gemm_shared_stream.h>
 #include <cutlass/gemm/linear_scaling.h>
 #include <cutlass/gemm/wmma_gemm_global_tile.h>
 #include <cutlass/gemm/wmma_gemm_shared_tile.h>
 #include <cutlass/reshape_tile.h>
 #include <cutlass/tile_iterator.h>

 namespace cutlass {
 namespace gemm {


 /// Compile-time helper that assembles the tile traits, iterators and fragment transformers
 /// consumed by the WMMA GEMM epilogue. It contains only type and constant definitions.
 ///
 /// Template parameters:
 ///   GemmConfig_       Supplies OutputTile, Warps, InstructionShape, AccumulatorsPerWarp,
 ///                     ScalarC, ScalarD, kWarpSize and the kScalarsPer{LdsD,LdgC,StgD} constants.
 ///   EpilogueFunctor_  Supplies the Scalar type used for the staging tile and WMMA fragment.
 ///   Index_            Index type forwarded to the iterators (defaults to int).
 template <typename GemmConfig_, typename EpilogueFunctor_, typename Index_ = int>
 struct WmmaGemmEpilogueTraitsHelper {
   /// The epilogue functor, re-exported under a fixed name.
   typedef EpilogueFunctor_ Functor;
   /// The scalar type declared by the epilogue functor.
   typedef typename EpilogueFunctor_::Scalar Scalar;
   /// The output tile declared by the GEMM configuration.
   typedef typename GemmConfig_::OutputTile OutputTile;

   /// Ratio of the per-warp accumulator extent to the instruction extent, both along H.
   static int const kWmmasPerH =
       GemmConfig_::AccumulatorsPerWarp::kH / GemmConfig_::InstructionShape::kH;
   /// Iteration counts: kWmmasPerH in the last dimension, 1 elsewhere.
   typedef Shape<1, 1, kWmmasPerH> Iterations;
   /// Iteration strides; all zero here.
   typedef Shape<0, 0, 0> Delta;

   /// Traits describing how D is stored to the shared tile.
   typedef WmmaGemmSharedStoreTileDTraits<
       // Layout of the stored tile.
       MatrixLayout::kColumnMajor,
       // Element type: the functor's scalar.
       Scalar,
       // Extent of the output tile.
       OutputTile,
       // Warp arrangement from the configuration.
       typename GemmConfig_::Warps,
       // Shape of a single WMMA instruction.
       typename GemmConfig_::InstructionShape>
       SharedStoreTileTraits;

   /// The WMMA fragment type for operand C, column-major, holding Scalar elements.
   /// NOTE: this member name hides the WmmaMatrix template for the rest of the struct, so it
   /// must stay declared before SharedStoreIteratorD, which refers to the member typedef.
   typedef WmmaMatrix<GemmOperand::kC,
                      MatrixLayout::kColumnMajor,
                      Scalar,
                      typename GemmConfig_::InstructionShape>
       WmmaMatrix;

   /// Iterator that stores WMMA fragments of D; advances along H in MemorySpace::kShared.
   typedef TileStoreIterator<SharedStoreTileTraits,
                             typename SharedStoreTileTraits::Scalar,
                             IteratorAdvance::kH,
                             MemorySpace::kShared,
                             Index_,
                             WmmaMatrix,
                             IteratorFragment::kWmmaMatrix>
       SharedStoreIteratorD;

   /// Transformer applied to the fragment before the shared store (a plain Copy).
   typedef Copy<typename SharedStoreIteratorD::Fragment> SharedStoreTransformerD;

   /// Traits describing how D is loaded back from the shared tile.
   typedef WmmaGemmSharedLoadTileDTraits<
       // Element type: the functor's scalar.
       Scalar,
       // The tile written by SharedStoreIteratorD.
       typename SharedStoreIteratorD::Tile,
       // Thread arrangement: (number of warps) x (warp size).
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // Scalars per access, from GemmConfig_::kScalarsPerLdsD.
       GemmConfig_::kScalarsPerLdsD>
       SharedLoadTileTraits;

   /// Iterator that loads D from the shared tile; advances along H in MemorySpace::kShared.
   typedef TileLoadIterator<SharedLoadTileTraits,
                            typename SharedLoadTileTraits::Scalar,
                            IteratorAdvance::kH,
                            MemorySpace::kShared>
       SharedLoadIteratorD;

   /// Traits for the iterator that loads C.
   typedef WmmaGemmGlobalIteratorCdTraits<
       // Element type: const-qualified ScalarC from the configuration.
       typename GemmConfig_::ScalarC const,
       // The tile has size (N / Iterations)xM in GEMM's terminology.
       Shape<1, OutputTile::kH / ShapeCount<Iterations>::kCount, OutputTile::kW>,
       // Thread arrangement: (number of warps) x (warp size); the traits may reorganize it.
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // Scalars per access, from GemmConfig_::kScalarsPerLdgC.
       GemmConfig_::kScalarsPerLdgC>
       GlobalLoadTileTraits;

   /// Iterator that loads C, and the transformer applied to its fragment (a plain Copy).
   typedef WmmaGemmGlobalIteratorCd<GlobalLoadTileTraits, Index_> GlobalLoadIteratorC;
   typedef Copy<typename GlobalLoadIteratorC::Fragment> GlobalTransformerC;

   /// Traits for the iterator that stores D.
   typedef WmmaGemmGlobalIteratorCdTraits<
       // Element type: ScalarD from the configuration.
       typename GemmConfig_::ScalarD,
       // The tile has size (N / Iterations)xM in GEMM's terminology.
       Shape<1, OutputTile::kH / ShapeCount<Iterations>::kCount, OutputTile::kW>,
       // Thread arrangement: (number of warps) x (warp size); the traits may reorganize it.
       Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount, GemmConfig_::kWarpSize>,
       // Scalars per access, from GemmConfig_::kScalarsPerStgD.
       GemmConfig_::kScalarsPerStgD>
       GlobalStoreTileTraits;

   /// Iterator that stores D, and the transformer applied to its fragment (a plain Copy).
   typedef WmmaGemmGlobalIteratorCd<GlobalStoreTileTraits, Index_> GlobalStoreIteratorD;
   typedef Copy<typename GlobalStoreIteratorD::Fragment> GlobalTransformerD;
 };


 }  // namespace gemm
 }  // namespace cutlass

 #endif  // defined CUTLASS_USE_WMMA_API
wmma_matrix.h
Abstractions for loading and storing matrices using the CUDA WMMA API.

cutlass::MemorySpace::kShared
Definition: load_store.h:42

cutlass
Definition: convert.h:33

tile_iterator.h
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.

linear_scaling.h
Implements the BLAS linear scaling function alpha*AB + beta*C.

coord.h
A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::IteratorAdvance::kH
Definition: tile_iterator.h:62

cutlass::GemmOperand::kC
Definition: matrix_traits.h:43

reshape_tile.h
Defines a type for restructuring a tile.

cutlass::IteratorFragment::kWmmaMatrix
Definition: tile_iterator.h:67

wmma_gemm_global_tile.h
Defines tile iterator traits for loading a thread block-level tile from global memory.

cutlass::ShapeCount::kCount
static int const kCount
The number of elements in the 4D space.
Definition: shape.h:91

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:36

gemm_global_stream.h
Implements efficient loading of the thread block-level tile from global memory and storing to shared ...

gemm_shared_stream.h
Defines abstractions for managing loading and storing fragments to shared memory in the efficient GEM...

convert.h
Defines conversion operations among Fragments of different base type.

wmma_gemm_shared_tile.h
Defines iterator traits for efficiently loading and storing fragment to and from shared memory...