9fd55460c6c16d0edb11beb60087a05470776ede/docs/gemm__shared__tile_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <cutlass/gemm/gemm_operand.h>

 namespace cutlass {
 namespace gemm {


 /// Traits for storing a tile of A or B to shared memory without any skew.
 ///
 /// Template parameters:
 ///   Scalar_          Element type; a const qualifier, if present, is stripped for Scalar.
 ///   Tile_            Shape of the tile before it is reshaped for vector accesses.
 ///   Threads_         Shape describing how the threads are arranged (kH, kW and kC are used).
 ///   kScalarsPerSts_  Number of scalars written by a single store.
 template <typename Scalar_, typename Tile_, typename Threads_, int kScalarsPerSts_>
 struct GemmSharedStoreTileAbTraits {
   /// The scalar.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer (keeps the const-ness of Scalar_).
   typedef Scalar_* Pointer;
   /// The tile, as produced by ReshapeTile for accesses of kScalarsPerSts_ scalars.
   typedef typename ReshapeTile<Tile_, kScalarsPerSts_>::Tile Tile;
   /// The threads.
   typedef Threads_ Threads;
   /// The strides used to compute the base position of a thread.
   typedef Shape<0, ShapeCount<Tile>::kWc, Tile::kC, kScalarsPerSts_> ThreadsStrides;

   /// The skew; this variant never pads the tile.
   static int const kSkew = 0;
   /// The number of scalars per store.
   static int const kAccessSize = kScalarsPerSts_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared;

   /// The number of iterations needed to store the tile: the tile extent in each dimension
   /// divided by the thread extent (and, for C, by the access size as well).
   typedef Shape<1, Tile::kH / Threads::kH, Tile::kW / Threads::kW, Tile::kC / Threads::kC / kAccessSize>
       Iterations;
   /// The strides in each dimension between different stores.
   typedef Shape<0, Threads::kH * ShapeCount<Tile>::kWc, Threads::kW * kAccessSize> Delta;
   /// The strides used for immediate offsets; same shape as Delta.
   typedef Shape<0, Threads::kH * ShapeCount<Tile>::kWc, Threads::kW * kAccessSize>
       ImmediateOffsetStrides;

   /// Functor returning the base offset of the calling thread as a 4-element coordinate.
   /// Only the third component carries a value; the others are zero.
   struct ThreadOffset {
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       return make_Coord(0, 0, ComputeThreadOffsetFromStrides<Threads, ThreadsStrides>::get(), 0);
     }
   };
 };


 /// Traits for storing a tile of A or B to shared memory when each row is padded by a skew.
 ///
 /// Template parameters:
 ///   Scalar_          Element type; a const qualifier, if present, is stripped for Scalar.
 ///   Tile_            Shape of the tile before skew and reshaping (kD, kH and kW are used).
 ///   Threads_         Shape describing how the threads are arranged (kH and kW are used).
 ///   kScalarsPerSts_  Number of scalars written by a single store.
 ///   kSkew_           Number of scalars added to the W extent of the tile.
 template <typename Scalar_, typename Tile_, typename Threads_, int kScalarsPerSts_, int kSkew_>
 struct GemmSharedStoreWithSkewTileAbTraits {
   /// The scalar.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer (keeps the const-ness of Scalar_).
   typedef Scalar_* Pointer;
   /// The tile without skew, after reshaping.
   typedef typename ReshapeTile<Tile_, kScalarsPerSts_>::Tile TileWithoutSkew;
   /// The tile with kSkew_ added to its W extent, after reshaping.
   typedef typename ReshapeTile<Shape<Tile_::kD, Tile_::kH, Tile_::kW + kSkew_>, kScalarsPerSts_>::Tile
       Tile;
   /// The threads.
   typedef Threads_ Threads;

   /// The skew.
   static int const kSkew = kSkew_;
   /// The number of scalars per store.
   static int const kAccessSize = kScalarsPerSts_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared;

   /// The number of iterations needed to store the tile. Note the pairing: the H extent of the
   /// unskewed tile is divided by Threads::kW, and its W extent by Threads::kH.
   typedef Shape<1, TileWithoutSkew::kH / Threads::kW, TileWithoutSkew::kW / Threads::kH> Iterations;
   /// The strides in each dimension between different stores (computed on the skewed tile).
   typedef Shape<0, ShapeCount<Tile>::kWc, Threads::kH * kAccessSize> Delta;
   /// The strides used for immediate offsets; same shape as Delta.
   typedef Shape<0, ShapeCount<Tile>::kWc, Threads::kH * kAccessSize> ImmediateOffsetStrides;

   /// Functor returning the base offset of the calling thread as a 4-element coordinate.
   /// Only the third component carries a value; the others are zero.
   struct ThreadOffset {
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       return make_Coord(0, 0, ComputeThreadOffsetFromStrides<Threads, ThreadsStrides>::get(), 0);
     }
   };

  protected:
   /// The strides used to compute the base position of a thread.
   typedef Shape<0, kScalarsPerSts_, ShapeCount<Tile>::kHwc / Threads::kW> ThreadsStrides;
 };


 /// Traits for loading the A operand of the GEMM from shared memory.
 ///
 /// Template parameters:
 ///   Scalar_            Element type; a const qualifier, if present, is stripped for Scalar.
 ///   OutputTile_        Shape of the output tile (kD and the A extent from GetExtent are used).
 ///   Warps_             Shape describing the arrangement of the warps.
 ///   ThreadsPerWarp_    Shape describing the arrangement of the threads inside a warp.
 ///   InstructionShape_  Shape of the instruction (only kD is used here).
 ///   kStages_           Number of stages; becomes the D extent of the tile.
 ///   kScalarsPerLds_    Number of scalars read by a single load.
 ///   kSkew_             Number of scalars added to the W extent of the tile (default 0).
 template <typename Scalar_,
           typename OutputTile_,
           typename Warps_,
           typename ThreadsPerWarp_,
           typename InstructionShape_,
           int kStages_,
           int kScalarsPerLds_,
           int kSkew_ = 0>
 struct GemmSharedLoadTileATraits {
   /// The operand these traits describe.
   static GemmOperand::Kind const kOperand = GemmOperand::kA;
   /// The scalar.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer (keeps the const-ness of Scalar_).
   typedef Scalar_* Pointer;
   /// The tile without skew, before reshaping.
   typedef Shape<kStages_,
                 OutputTile_::kD / InstructionShape_::kD,
                 GetExtent<kOperand, OutputTile_>::kExtent * InstructionShape_::kD>
       TileWithoutSkew_;
   /// The tile with kSkew_ added to its W extent, before reshaping.
   typedef Shape<kStages_, TileWithoutSkew_::kH, TileWithoutSkew_::kW + kSkew_> TileWithSkew;
   /// The tile without skew, after reshaping.
   typedef typename ReshapeTile<TileWithoutSkew_, kScalarsPerLds_>::Tile TileWithoutSkew;
   /// The tile (with skew, after reshaping).
   typedef typename ReshapeTile<TileWithSkew, kScalarsPerLds_>::Tile Tile;
   /// The warps.
   typedef Warps_ Warps;
   /// The threads in a warp.
   typedef ThreadsPerWarp_ ThreadsPerWarp;
   /// The number of scalars per load.
   static int const kAccessSize = kScalarsPerLds_;
   /// The skew.
   static int const kSkew = kSkew_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared;

   /// The number of warps along the dimension GetExtent selects for this operand.
   static int const kWarps = GetExtent<kOperand, Warps>::kExtent;
   /// The number of threads per warp along that same dimension.
   static int const kThreadsPerWarp = GetExtent<kOperand, ThreadsPerWarp>::kExtent;

   /// The number of iterations needed to load the tile.
   /// NOTE(review): there is deliberately no further division by the access size here;
   /// presumably the reshaped tile already accounts for it -- confirm against ReshapeTile.
   typedef Shape<1, 1, TileWithoutSkew::kW / kWarps / kThreadsPerWarp> Iterations;
   /// The strides in each dimension between different loads.
   typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> Delta;
   /// The strides used for immediate offsets; same shape as Delta.
   typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0>
       ImmediateOffsetStrides;

   /// Functor returning the base offset of the calling thread as a 4-element coordinate.
   /// Only the third component carries a value; the others are zero.
   struct ThreadOffset {
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       // Index of the warp, wrapped to the W extent of the warp arrangement.
       int const warp_idx = (threadIdx.x / kWarpSize) % Warps::kW;
       // Bits 1..3 of the thread index select the position inside the warp.
       int const lane_idx = (threadIdx.x >> 1) & 0x07;

       // Scale by the access size to obtain an offset in scalars.
       return make_Coord(0, 0, (warp_idx * ThreadsPerWarp::kW + lane_idx) * kAccessSize, 0);
     }
   };
 };


 /// Traits for loading the B operand of the GEMM from shared memory.
 ///
 /// Template parameters:
 ///   Scalar_            Element type; a const qualifier, if present, is stripped for Scalar.
 ///   OutputTile_        Shape of the output tile (kD and the B extent from GetExtent are used).
 ///   Warps_             Shape describing the arrangement of the warps.
 ///   ThreadsPerWarp_    Shape describing the arrangement of the threads inside a warp.
 ///   InstructionShape_  Shape of the instruction (only kD is used here).
 ///   kStages_           Number of stages; becomes the D extent of the tile.
 ///   kScalarsPerLds_    Number of scalars read by a single load.
 ///   kSkew_             Number of scalars added to the W extent of the tile (default 0).
 template <typename Scalar_,
           typename OutputTile_,
           typename Warps_,
           typename ThreadsPerWarp_,
           typename InstructionShape_,
           int kStages_,
           int kScalarsPerLds_,
           int kSkew_ = 0>
 struct GemmSharedLoadTileBTraits {
   /// The operand these traits describe.
   static GemmOperand::Kind const kOperand = GemmOperand::kB;
   /// The scalar.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer (keeps the const-ness of Scalar_).
   typedef Scalar_* Pointer;
   /// The tile without skew, before reshaping.
   typedef Shape<kStages_,
                 OutputTile_::kD / InstructionShape_::kD,
                 GetExtent<kOperand, OutputTile_>::kExtent * InstructionShape_::kD>
       TileWithoutSkew_;
   /// The tile with kSkew_ added to its W extent, before reshaping.
   typedef Shape<kStages_, TileWithoutSkew_::kH, TileWithoutSkew_::kW + kSkew_> TileWithSkew;
   /// The tile without skew, after reshaping.
   typedef typename ReshapeTile<TileWithoutSkew_, kScalarsPerLds_>::Tile TileWithoutSkew;
   /// The tile (with skew, after reshaping).
   typedef typename ReshapeTile<TileWithSkew, kScalarsPerLds_>::Tile Tile;
   /// The warps.
   typedef Warps_ Warps;
   /// The threads in a warp.
   typedef ThreadsPerWarp_ ThreadsPerWarp;
   /// The number of scalars per load.
   static int const kAccessSize = kScalarsPerLds_;
   /// The skew.
   static int const kSkew = kSkew_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared;

   /// The number of warps along the dimension GetExtent selects for this operand.
   static int const kWarps = GetExtent<kOperand, Warps>::kExtent;
   /// The number of threads per warp along that same dimension.
   static int const kThreadsPerWarp = GetExtent<kOperand, ThreadsPerWarp>::kExtent;

   /// The number of iterations needed to load the tile.
   /// NOTE(review): there is deliberately no further division by the access size here;
   /// presumably the reshaped tile already accounts for it -- confirm against ReshapeTile.
   typedef Shape<1, 1, TileWithoutSkew::kW / kWarps / kThreadsPerWarp> Iterations;
   /// The strides in each dimension between different loads.
   typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> Delta;
   /// The strides used for immediate offsets; same shape as Delta.
   typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0>
       ImmediateOffsetStrides;

   /// Functor returning the base offset of the calling thread as a 4-element coordinate.
   /// Only the third component carries a value; the others are zero.
   struct ThreadOffset {
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       // Position of the warp: one step for every Warps::kW warps' worth of threads.
       int const warp_idx = threadIdx.x / (Warps::kW * kWarpSize);
       // Bit 4 of the thread index contributes 2 and bit 0 contributes 1.
       int const lane_idx = ((threadIdx.x >> 3) & 0x02) + (threadIdx.x & 0x01);

       // Scale by the access size to obtain an offset in scalars.
       return make_Coord(0, 0, (warp_idx * ThreadsPerWarp::kH + lane_idx) * kAccessSize, 0);
     }
   };
 };


 /// Traits for storing the accumulators (D) to shared memory in the epilogue.
 ///
 /// Template parameters:
 ///   Scalar_          Element type; a const qualifier, if present, is stripped for Scalar.
 ///   OutputTile_      Shape of the output tile (only kW is used here).
 ///   Warps_           Shape describing the arrangement of the warps.
 ///   ThreadsPerWarp_  Shape describing the arrangement of the threads inside a warp.
 ///   kScalarsPerSts_  Number of scalars written by a single store.
 ///   kSkew_           Number of scalars added to each row of the tile (default 0).
 template <typename Scalar_,
           typename OutputTile_,
           typename Warps_,
           typename ThreadsPerWarp_,
           int kScalarsPerSts_,
           int kSkew_ = 0>
 struct GemmSharedStoreTileDTraits {
   /// The scalar.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer (keeps the const-ness of Scalar_).
   typedef Scalar_* Pointer;
   /// The dimension of the output tile.
   typedef OutputTile_ OutputTile;
   /// The warps in the tile.
   typedef Warps_ Warps;
   /// The threads in the warps.
   typedef ThreadsPerWarp_ ThreadsPerWarp;
   /// The number of scalars per store.
   static int const kAccessSize = kScalarsPerSts_;
   /// The skew.
   static int const kSkew = kSkew_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared;

   /// The number of scalars per thread.
   static int const kScalarsPerThread = OutputTile_::kW / Warps::kW / ThreadsPerWarp::kW;
   /// The number of threads.
   static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize;
   /// The number of scalars per row: half of the threads share a row, plus the skew.
   static int const kScalarsPerRow = kThreads / 2 * kScalarsPerThread + kSkew;

   /// The tile: two rows of kScalarsPerRow scalars, grouped by access size.
   typedef Shape<1, 2, kScalarsPerRow / kAccessSize, kAccessSize> Tile;
   /// The number of iterations needed to store the tile.
   typedef Shape<1, 1, kScalarsPerThread / kAccessSize> Iterations;
   /// The strides in each dimension between different stores.
   typedef Shape<0, 0, Warps::kW * ThreadsPerWarp::kW * kAccessSize> Delta;
   /// The strides used for immediate offsets; same shape as Delta.
   typedef Shape<0, 0, Warps::kW * ThreadsPerWarp::kW * kAccessSize> ImmediateOffsetStrides;

   /// Functor returning the base offset of the calling thread as a 4-element coordinate.
   /// Only the third component carries a value; the others are zero.
   struct ThreadOffset {
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       // The epilogue issues STS.128 to move the accumulators into shared memory. With STS.128,
       // threads in groups of 8 must not hit the same bank (i.e. they have to write to different
       // banks), which is what the layout below is arranged for.
       unsigned int const tid = threadIdx.x;

       // Odd threads go to the second row of the tile, even threads to the first.
       int const tile_row = tid & 0x01;

       // Split the warp index into its position along W and the remaining position.
       int const warp_idx = (tid >> 5);
       int const warp_in_w = (warp_idx % Warps::kW);
       int const warp_in_h = (warp_idx / Warps::kW);

       // Bit 4 of the thread index (upper half of the warp) shifts by one output-tile width.
       int upper_half_shift = OutputTile::kW * ((tid >> 4) & 1);
       // Bits 1..3 of the thread index, offset by the warp's position along W.
       int lower_half_index = (((tid >> 1) & 0x7) + warp_in_w * ThreadsPerWarp::kW);

       int column = kAccessSize * lower_half_index +
                    warp_in_h * (ThreadsPerWarp::kH / 2) * OutputTile::kW + upper_half_shift;

       return make_Coord(0, 0, tile_row * kScalarsPerRow + column, 0);
     }
   };
 };


 /// Traits for loading the accumulators (D) back from shared memory in the epilogue.
 ///
 /// Template parameters:
 ///   Scalar_          Element type; a const qualifier, if present, is stripped for Scalar.
 ///   OutputTile_      Shape of the output tile (only kW is used here).
 ///   Warps_           Shape describing the arrangement of the warps.
 ///   ThreadsPerWarp_  Shape describing the arrangement of the threads inside a warp.
 ///   kTileH_          Extent in H that is divided among the warps to get the iteration count.
 ///   kScalarsPerLds_  Number of scalars read by a single load.
 ///   kSkew_           Number of scalars added to each row of the tile (default 0).
 template <typename Scalar_,
           typename OutputTile_,
           typename Warps_,
           typename ThreadsPerWarp_,
           int kTileH_,
           int kScalarsPerLds_,
           int kSkew_ = 0>
 struct GemmSharedLoadTileDTraits {
   /// The scalar.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer (keeps the const-ness of Scalar_).
   typedef Scalar_* Pointer;
   /// The dimension of the output tile.
   typedef OutputTile_ OutputTile;
   /// The warps in the tile.
   typedef Warps_ Warps;
   /// The threads in the warps.
   typedef ThreadsPerWarp_ ThreadsPerWarp;
   /// The number of scalars per load.
   static int const kAccessSize = kScalarsPerLds_;
   /// The skew.
   static int const kSkew = kSkew_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared;

   /// The number of scalars per thread.
   static int const kScalarsPerThread = OutputTile_::kW / Warps::kW / ThreadsPerWarp::kW;
   /// The number of threads.
   static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize;
   /// The number of scalars per row: half of the threads share a row, plus the skew.
   static int const kScalarsPerRow = kThreads / 2 * kScalarsPerThread + kSkew;

   /// The tile: two rows of kScalarsPerRow scalars, grouped by access size.
   typedef Shape<1, 2, kScalarsPerRow / kAccessSize, kAccessSize> Tile;

   /// The number of iterations each warp performs in the Tile::kH dimension.
   static int const kIterationsInHPerWarp = kTileH_ / ShapeCount<Warps>::kCount;

   // The shared memory tile above has 2 rows, each made of kScalarsPerRow scalars. A warp is
   // expected to read from the 1st row, then move to the 2nd row, then return to the 1st row.
   // That scheme is modelled by an Iterations shape of the form Shape<X, 2, ...>. When a warp
   // has only a single iteration, the shape has to be Shape<1, 1, ...> instead; the two
   // constants below select between those cases.

   /// The number of iterations in H: 1 if the warp iterates only once, 2 otherwise.
   static int const kIterationsH = (kIterationsInHPerWarp == 1) ? 1 : 2;
   /// The number of iterations in D: whatever remains once kIterationsH is factored out.
   static int const kIterationsD = kIterationsInHPerWarp / kIterationsH;

   /// The number of iterations needed to load the tile.
   typedef Shape<kIterationsD, kIterationsH, OutputTile::kW / kWarpSize / kAccessSize> Iterations;
   /// The strides in each dimension between different loads.
   typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize> Delta;
   /// The strides used for immediate offsets; same shape as Delta.
   typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize> ImmediateOffsetStrides;

   /// Functor returning the base offset of the calling thread as a 4-element coordinate.
   /// Only the third component carries a value; the others are zero.
   struct ThreadOffset {
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       // Index of the warp the thread belongs to.
       int const warp_idx = threadIdx.x / kWarpSize;
       // Position of the thread inside its warp, scaled by the access size.
       int const lane_offset = (threadIdx.x & (kWarpSize - 1)) * kAccessSize;

       // More than one iteration in H: consecutive warps are laid out one after the other.
       if (Iterations::kH != 1) {
         return make_Coord(0, 0, warp_idx * OutputTile::kW * Iterations::kD + lane_offset, 0);
       }

       // A single iteration in H: the low bit of the warp index picks the row of the tile and
       // the remaining bits pick the position inside that row.
       int const tile_row = warp_idx & 0x1;
       int const tile_col = warp_idx / 2;
       return make_Coord(
           0,
           0,
           tile_row * ShapeCount<Tile>::kWc + tile_col * OutputTile::kW * Iterations::kD + lane_offset,
           0);
     }
   };
 };


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::kAccessSize
static int const kAccessSize
The number of scalars per STS.
Definition: gemm_shared_tile.h:95

cutlass::ComputeThreadOffsetFromStrides::get
static CUTLASS_DEVICE int get()
Definition: shape.h:253

cutlass::gemm::GemmSharedLoadTileBTraits::Tile
ReshapeTile< TileWithSkew, kScalarsPerLds_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:214

cutlass::gemm::GemmSharedLoadTileATraits::Tile
ReshapeTile< TileWithSkew, kScalarsPerLds_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:145

cutlass::gemm::GemmSharedLoadTileBTraits::TileWithoutSkew
ReshapeTile< TileWithoutSkew_, kScalarsPerLds_ >::Tile TileWithoutSkew
The tile without skew after reshaping.
Definition: gemm_shared_tile.h:212

cutlass::gemm::GemmSharedLoadTileDTraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:351

cutlass::gemm::GemmSharedLoadTileDTraits::kScalarsPerThread
static int const kScalarsPerThread
The number of scalars per thread.
Definition: gemm_shared_tile.h:354

cutlass::MemorySpace::kShared
Definition: load_store.h:42

cutlass::gemm::GemmSharedLoadTileBTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:242

cutlass::gemm::GemmSharedLoadTileBTraits::Iterations
Shape< 1, 1, TileWithoutSkew::kW/kWarps/kThreadsPerWarp > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:232

cutlass
Definition: convert.h:33

cutlass::gemm::GemmSharedLoadTileBTraits::kWarps
static int const kWarps
The number of warps.
Definition: gemm_shared_tile.h:227

cutlass::gemm::GemmSharedLoadTileATraits
Definition: gemm_shared_tile.h:129

cutlass::gemm::GemmSharedStoreTileAbTraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:42

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits
Definition: gemm_shared_tile.h:80

cutlass::gemm::GemmSharedStoreTileDTraits::kScalarsPerRow
static int const kScalarsPerRow
The number of scalars per row. We build a tile with 2 rows (to avoid bank conflicts).
Definition: gemm_shared_tile.h:287

cutlass::platform::remove_const::type
T type
Definition: platform.h:369

cutlass::gemm::GemmSharedLoadTileATraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:132

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::ThreadOffset
Definition: gemm_shared_tile.h:106

cutlass::gemm::GemmSharedStoreTileDTraits::Iterations
Shape< 1, 1, kScalarsPerThread/kAccessSize > Iterations
The number of iterations needed to store the tile.
Definition: gemm_shared_tile.h:292

cutlass::gemm::GemmSharedLoadTileDTraits::kAccessSize
static int const kAccessSize
The number of scalars per LDS.
Definition: gemm_shared_tile.h:347

cutlass::gemm::GemmSharedLoadTileATraits::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The threads in a warp.
Definition: gemm_shared_tile.h:149

cutlass::ReshapeTile
Definition: reshape_tile.h:42

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:241

cutlass::gemm::GemmSharedStoreTileAbTraits::ThreadsStrides
Shape< 0, ShapeCount< Tile >::kWc, Tile::kC, kScalarsPerSts_ > ThreadsStrides
The strides to compute the base position of the thread.
Definition: gemm_shared_tile.h:48

cutlass::gemm::GemmSharedStoreTileDTraits::kAccessSize
static int const kAccessSize
The number of scalars per STS.
Definition: gemm_shared_tile.h:276

cutlass::gemm::GemmSharedLoadTileDTraits::Iterations
Shape< kIterationsD, kIterationsH, OutputTile::kW/kWarpSize/kAccessSize > Iterations
The number of iterations needed to load the tile.
Definition: gemm_shared_tile.h:376

cutlass::gemm::GemmSharedLoadTileDTraits::kSkew
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:349

cutlass::gemm::GemmSharedLoadTileBTraits::Warps
Warps_ Warps
The number of warps.
Definition: gemm_shared_tile.h:216

cutlass::gemm::GemmSharedLoadTileBTraits::kSkew
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:222

cutlass::gemm::GemmSharedStoreTileAbTraits
Definition: gemm_shared_tile.h:38

cutlass::gemm::GemmSharedLoadTileBTraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:201

cutlass::gemm::GemmSharedLoadTileDTraits::ThreadOffset
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:383

cutlass::gemm::GemmSharedLoadTileBTraits
Definition: gemm_shared_tile.h:198

cutlass::gemm::GemmSharedLoadTileATraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:156

cutlass::gemm::GemmSharedStoreTileAbTraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:40

cutlass::gemm::GemmSharedLoadTileATraits::kOperand
static GemmOperand::Kind const kOperand
Definition: gemm_shared_tile.h:130

cutlass::gemm::GemmSharedStoreTileDTraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:280

cutlass::MemorySpace::Kind
Kind
Definition: load_store.h:40

cutlass::gemm::GemmSharedLoadTileBTraits::TileWithSkew
Shape< kStages_, TileWithoutSkew_::kH, TileWithoutSkew_::kW+kSkew_ > TileWithSkew
The tile with skew.
Definition: gemm_shared_tile.h:210

cutlass::gemm::GemmSharedLoadTileATraits::kAccessSize
static int const kAccessSize
The number of scalars per LDS.
Definition: gemm_shared_tile.h:152

cutlass::Shape::kH
static int const kH
The height of the cube.
Definition: shape.h:68

cutlass::gemm::GemmSharedStoreTileAbTraits::Iterations
Shape< 1, Tile::kH/Threads::kH, Tile::kW/Threads::kW, Tile::kC/Threads::kC/kAccessSize > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:61

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::kSkew
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:93

cutlass::gemm::GemmSharedLoadTileATraits::Iterations
Shape< 1, 1, TileWithoutSkew::kW/kWarps/kThreadsPerWarp > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:165

cutlass::gemm::GemmSharedStoreTileDTraits::OutputTile
OutputTile_ OutputTile
The dimension of the output tile.
Definition: gemm_shared_tile.h:270

cutlass::gemm::GemmSharedLoadTileDTraits::kScalarsPerRow
static int const kScalarsPerRow
The number of scalars per row. We build a tile with 2 rows (to avoid bank conflicts).
Definition: gemm_shared_tile.h:358

cutlass::gemm::GemmSharedLoadTileBTraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:203

cutlass::gemm::GemmSharedLoadTileATraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:134

cutlass::gemm::GemmSharedStoreTileDTraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:268

cutlass::gemm::GemmSharedStoreTileDTraits::kScalarsPerThread
static int const kScalarsPerThread
The number of scalars per thread.
Definition: gemm_shared_tile.h:283

cutlass::gemm::GemmSharedLoadTileDTraits::ImmediateOffsetStrides
Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:380

cutlass::gemm::GemmSharedStoreTileDTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:301

cutlass::gemm::GemmSharedStoreTileAbTraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:54

cutlass::gemm::GemmSharedStoreTileAbTraits::kSkew
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:50

cutlass::gemm::GemmSharedLoadTileBTraits::kThreadsPerWarp
static int const kThreadsPerWarp
The number of threads in one dimension of the warp.
Definition: gemm_shared_tile.h:229

cutlass::gemm::GemmSharedLoadTileBTraits::ThreadOffset
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:240

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::ImmediateOffsetStrides
Shape< 0, ShapeCount< Tile >::kWc, Threads::kH *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:104

cutlass::gemm::GemmSharedStoreTileDTraits::Tile
Shape< 1, 2, kScalarsPerRow/kAccessSize, kAccessSize > Tile
The tile.
Definition: gemm_shared_tile.h:290

cutlass::gemm::GemmSharedStoreTileAbTraits::kAccessSize
static int const kAccessSize
The number of scalars per STS.
Definition: gemm_shared_tile.h:52

cutlass::gemm::GemmSharedStoreTileAbTraits::Tile
ReshapeTile< Tile_, kScalarsPerSts_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:44

cutlass::gemm::GemmSharedStoreTileAbTraits::ThreadOffset
Definition: gemm_shared_tile.h:68

cutlass::gemm::GemmSharedLoadTileDTraits::kIterationsInHPerWarp
static int const kIterationsInHPerWarp
Definition: gemm_shared_tile.h:364

cutlass::gemm::GemmSharedStoreTileDTraits::kSkew
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:278

cutlass::gemm::GemmSharedLoadTileATraits::TileWithoutSkew
ReshapeTile< TileWithoutSkew_, kScalarsPerLds_ >::Tile TileWithoutSkew
The tile without skew after reshaping.
Definition: gemm_shared_tile.h:143

gemm_operand.h
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...

cutlass::gemm::GemmSharedStoreTileAbTraits::ImmediateOffsetStrides
Shape< 0, Threads::kH *ShapeCount< Tile >::kWc, Threads::kW *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:66

cutlass::gemm::GemmSharedLoadTileATraits::ImmediateOffsetStrides
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:170

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::Tile
ReshapeTile< Shape< Tile_::kD, Tile_::kH, Tile_::kW+kSkew_ >, kScalarsPerSts_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:89

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::ThreadsStrides
Shape< 0, kScalarsPerSts_, ShapeCount< Tile >::kHwc/Threads::kW > ThreadsStrides
The strides to compute the base position of the thread.
Definition: gemm_shared_tile.h:116

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::TileWithoutSkew
ReshapeTile< Tile_, kScalarsPerSts_ >::Tile TileWithoutSkew
The tile without skews.
Definition: gemm_shared_tile.h:86

cutlass::gemm::GemmSharedLoadTileDTraits::kIterationsD
static int const kIterationsD
Definition: gemm_shared_tile.h:373

cutlass::gemm::GemmSharedLoadTileATraits::kWarps
static int const kWarps
The number of warps.
Definition: gemm_shared_tile.h:159

cutlass::GemmOperand::kB
Definition: matrix_traits.h:43

cutlass::gemm::GemmSharedStoreTileDTraits::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The threads in the warps.
Definition: gemm_shared_tile.h:274

cutlass::gemm::GemmSharedLoadTileATraits::ThreadOffset
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:173

cutlass::gemm::GemmSharedLoadTileATraits::TileWithoutSkew_
Shape< kStages_, OutputTile_::kD/InstructionShape_::kD, GetExtent< kOperand, OutputTile_ >::kExtent *InstructionShape_::kD > TileWithoutSkew_
The tile without skew.
Definition: gemm_shared_tile.h:139

cutlass::gemm::GemmSharedLoadTileDTraits
Definition: gemm_shared_tile.h:335

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::Threads
Threads_ Threads
The threads.
Definition: gemm_shared_tile.h:91

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46

cutlass::gemm::GemmSharedLoadTileDTraits::OutputTile
OutputTile_ OutputTile
The dimension of the output tile.
Definition: gemm_shared_tile.h:341

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:82

cutlass::gemm::GemmSharedLoadTileATraits::Delta
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:167

cutlass::gemm::GemmSharedStoreTileDTraits::ImmediateOffsetStrides
Shape< 0, 0, Warps::kW *ThreadsPerWarp::kW *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:296

cutlass::gemm::GemmSharedLoadTileBTraits::kOperand
static GemmOperand::Kind const kOperand
Definition: gemm_shared_tile.h:199

cutlass::gemm::GemmSharedLoadTileDTraits::Tile
Shape< 1, 2, kScalarsPerRow/kAccessSize, kAccessSize > Tile
The tile.
Definition: gemm_shared_tile.h:361

cutlass::gemm::GemmSharedLoadTileATraits::kThreadsPerWarp
static int const kThreadsPerWarp
The number of threads in one dimension of the warp.
Definition: gemm_shared_tile.h:161

cutlass::Shape
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:84

cutlass::gemm::GemmSharedLoadTileDTraits::Delta
Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:378

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::Iterations
Shape< 1, TileWithoutSkew::kH/Threads::kW, TileWithoutSkew::kW/Threads::kH > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:100

cutlass::gemm::GemmSharedLoadTileBTraits::TileWithoutSkew_
Shape< kStages_, OutputTile_::kD/InstructionShape_::kD, GetExtent< kOperand, OutputTile_ >::kExtent *InstructionShape_::kD > TileWithoutSkew_
The tile without skew.
Definition: gemm_shared_tile.h:208

cutlass::gemm::GemmSharedStoreTileAbTraits::Threads
Threads_ Threads
The threads.
Definition: gemm_shared_tile.h:46

cutlass::gemm::GetExtent
Definition: gemm_operand.h:50

cutlass::Coord< 4 >

cutlass::gemm::GemmSharedStoreTileAbTraits::Delta
Shape< 0, Threads::kH *ShapeCount< Tile >::kWc, Threads::kW *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:63

cutlass::gemm::GemmSharedLoadTileDTraits::kThreads
static int const kThreads
The number of threads.
Definition: gemm_shared_tile.h:356

cutlass::gemm::GemmSharedLoadTileATraits::Warps
Warps_ Warps
The number of warps.
Definition: gemm_shared_tile.h:147

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:97

cutlass::gemm::GemmSharedLoadTileBTraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:224

cutlass::gemm::GemmSharedStoreTileAbTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:70

cutlass::gemm::GemmSharedLoadTileATraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:175

cutlass::Shape::kD
static int const kD
The depth of the cube.
Definition: shape.h:66

cutlass::gemm::GemmSharedStoreTileDTraits::ThreadOffset
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:299

cutlass::gemm::GemmSharedLoadTileDTraits::Warps
Warps_ Warps
The warps in the tile.
Definition: gemm_shared_tile.h:343

cutlass::ReshapeTile::Tile
Tile_ Tile
Definition: reshape_tile.h:43

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::Delta
Shape< 0, ShapeCount< Tile >::kWc, Threads::kH *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:102

cutlass::gemm::GemmSharedLoadTileDTraits::kIterationsH
static int const kIterationsH
Definition: gemm_shared_tile.h:371

cutlass::gemm::GemmSharedStoreTileDTraits::Delta
Shape< 0, 0, Warps::kW *ThreadsPerWarp::kW *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:294

cutlass::GemmOperand::Kind
Kind
Definition: matrix_traits.h:43

cutlass::gemm::GemmSharedLoadTileATraits::kSkew
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:154

cutlass::gemm::GemmSharedLoadTileDTraits::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The threads in the warps.
Definition: gemm_shared_tile.h:345

cutlass::GemmOperand::kA
Definition: matrix_traits.h:43

cutlass::gemm::GemmSharedLoadTileDTraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:339

cutlass::gemm::GemmSharedStoreTileDTraits::kThreads
static int const kThreads
The number of threads.
Definition: gemm_shared_tile.h:285

cutlass::gemm::GemmSharedLoadTileBTraits::Delta
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:234

cutlass::gemm::GemmSharedLoadTileBTraits::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The threads in a warp.
Definition: gemm_shared_tile.h:218

cutlass::gemm::GemmSharedStoreTileDTraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:266

cutlass::ShapeCount
Compute derived counts of a Layout Concept based class.
Definition: shape.h:79

cutlass::gemm::GemmSharedLoadTileATraits::TileWithSkew
Shape< kStages_, TileWithoutSkew_::kH, TileWithoutSkew_::kW+kSkew_ > TileWithSkew
The tile with skew.
Definition: gemm_shared_tile.h:141

cutlass::gemm::GemmSharedStoreTileDTraits::Warps
Warps_ Warps
The warps in the tile.
Definition: gemm_shared_tile.h:272

cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:108

cutlass::gemm::GemmSharedLoadTileDTraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:337

cutlass::gemm::GemmSharedLoadTileDTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:385

cutlass::gemm::GemmSharedLoadTileBTraits::ImmediateOffsetStrides
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:237

cutlass::gemm::GemmSharedStoreTileDTraits
Definition: gemm_shared_tile.h:264

cutlass::gemm::GemmSharedLoadTileBTraits::kAccessSize
static int const kAccessSize
The number of scalars per LDS.
Definition: gemm_shared_tile.h:220