Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
gemm_global_tile.h
/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/coord.h"
#include "cutlass/util/platform.h"

#include "cutlass/gemm/gemm_operand.h"
#include "cutlass/matrix_traits.h"
#include "cutlass/predicate_vector.h"
#include "cutlass/reshape_tile.h"
#include "cutlass/tile_iterator.h"

namespace cutlass {
namespace gemm {

////////////////////////////////////////////////////////////////////////////////////////////////////

// The following functor reshapes a tile of threads to match a tile of data. The idea is that when
// the user builds the iterator traits, they may want to specify the tile independently of the
// number of scalars loaded/stored per instruction. For example, in the row-major version with a
// tile of size 128x8, the user may want the iterator to work with 32x8 threads if each thread
// loads 1 scalar per LDG. If the user switches to 4 scalars per LDG, the tile of threads has to
// change. The code below detects that case and corrects the thread arrangement automatically - it
// is a helper for when the user does not specify the right configuration.

template <typename Tile_, typename Threads_, bool = (Tile_::kW < Threads_::kW)>
struct ReshapeThreads {
  typedef Threads_ Threads;
};

template <typename Tile_, typename Threads_>
struct ReshapeThreads<Tile_, Threads_, true> {
  typedef Shape<Threads_::kD, Threads_::kH * Threads_::kW / Tile_::kW, Tile_::kW, 1> Threads;
};

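// Illustrative only (not part of the original file): with a vectorized tile of
// Shape<1, 8, 32> and a requested thread arrangement of Shape<1, 4, 64>, the tile is
// narrower than the thread arrangement (kW: 32 < 64), so the specialization above folds
// the excess threads into the H dimension:
//
//   ReshapeThreads<Shape<1, 8, 32>, Shape<1, 4, 64> >::Threads
//       == Shape<1, 4 * 64 / 32, 32, 1> == Shape<1, 8, 32, 1>
//
// A tile at least as wide as the thread arrangement is left untouched by the primary template.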
////////////////////////////////////////////////////////////////////////////////////////////////////

template <GemmOperand::Kind kOperand_,
          MatrixLayout::Kind kLayout_,
          typename Scalar_,
          typename Tile_,
          typename Threads_,
          int kAccessSize_>
struct GemmGlobalTileTraits {
  /// Identity of the operand.
  static GemmOperand::Kind const kOperand = kOperand_;
  /// The layout.
  static MatrixLayout::Kind const kLayout = kLayout_;
  /// The scalar.
  typedef typename platform::remove_const<Scalar_>::type Scalar;
  /// The pointer.
  typedef Scalar_* Pointer;
  /// The number of scalars per LDG/STG.
  static int const kAccessSize = kAccessSize_;
  /// The memory space.
  static MemorySpace::Kind const kMemorySpace = MemorySpace::kGlobal;

  /// The tile shape.
  typedef Tile_ Tile;
  /// The vectorized tile shape.
  typedef typename ReshapeTile<Tile_, kAccessSize_>::Tile VectorizedTile;
  /// The threads shape.
  typedef typename ReshapeThreads<VectorizedTile, Threads_>::Threads Threads;
  /// The relative offset between two elements in the H/W dimension in adjacent threads.
  typedef Shape<1, 1, VectorizedTile::kC> ThreadsDelta;
  /// The strides in each dimension between different loads/stores.
  typedef Shape<0, Threads::kH, Threads::kW * kAccessSize> Delta;

  /// Strides for immediate offset computation.
  typedef Shape<0, 0, Threads::kW * ThreadsDelta::kW, kAccessSize> ImmediateOffsetStrides;
  /// The number of iterations needed to load/store the tile.
  typedef Shape<1,
                VectorizedTile::kH / Threads::kH,
                VectorizedTile::kW / Threads::kW,
                VectorizedTile::kC / kAccessSize>
      Iterations;

  typedef GemmMultiplicandTraits<Tile, kOperand, kLayout> MultiplicandTraits;

  /// Computes the thread offset in (H, W) based on thread ID.
  struct ThreadOffset {
    CUTLASS_HOST_DEVICE Coord<4> operator()() const {
      int thread_offset_h = threadIdx.x / Threads::kW * ThreadsDelta::kH;
      int thread_offset_w = threadIdx.x % Threads::kW * ThreadsDelta::kW;

      return make_Coord(0, thread_offset_h, thread_offset_w, 0);
    }
  };
};
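
// Illustrative only (not part of the original file): a hypothetical instantiation for the
// A operand, assuming ReshapeTile splits the W dimension into (W / kAccessSize) x kAccessSize:
//
//   typedef GemmGlobalTileTraits<GemmOperand::kA, MatrixLayout::kColumnMajor,
//                                float const, Shape<1, 8, 128>, Shape<1, 8, 32>, 4> TraitsA;
//
//   TraitsA::VectorizedTile == Shape<1, 8, 32, 4>  // 128 scalars become 32 vectors of 4
//   TraitsA::Threads        == Shape<1, 8, 32>     // unchanged: tile kW (32) is not < threads kW (32)
//   TraitsA::ThreadsDelta   == Shape<1, 1, 4>
//   TraitsA::Iterations     == Shape<1, 1, 1, 1>   // one 4-scalar LDG covers each thread's share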

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Scalar_, typename Tile_, typename Threads_, int kStrideH_, int kAccessSize_>
struct GemmGlobalTileCdTraits : public GemmGlobalTileTraits<GemmOperand::kC,
                                                            MatrixLayout::kColumnMajor,
                                                            Scalar_,
                                                            Tile_,
                                                            Threads_,
                                                            kAccessSize_> {
  /// The base class.
  typedef GemmGlobalTileTraits<GemmOperand::kC,
                               MatrixLayout::kColumnMajor,
                               Scalar_,
                               Tile_,
                               Threads_,
                               kAccessSize_>
      Base;

  /// The stride in the H dimension.
  static int const kStrideH = kStrideH_;
  /// Override the strides in each dimension between different loads/stores.
  typedef Shape<0, 0, Base::Delta::kW, Base::Delta::kC> Delta;

  typedef typename Base::Iterations Iterations;

  typedef typename Base::Threads Threads;

  typedef typename Base::ThreadsDelta ThreadsDelta;

  typedef typename Base::ImmediateOffsetStrides ImmediateOffsetStrides;

  /// Computes the thread offset in (H, W) based on thread ID.
  struct ThreadOffset {
    CUTLASS_HOST_DEVICE Coord<4> operator()() const {
      int thread_offset_h = threadIdx.x / Threads::kW * kStrideH * Iterations::kH;
      int thread_offset_w = threadIdx.x % Threads::kW * ThreadsDelta::kW;

      return make_Coord(0, thread_offset_h, thread_offset_w, 0);
    }
  };
};
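
// Illustrative only (not part of the original file): suppose Threads == Shape<1, 8, 32>,
// kStrideH == 2, Iterations::kH == 4 and kAccessSize == 1 (so ThreadsDelta::kW == 1).
// Then thread 70 (threadIdx.x == 70) computes
//
//   thread_offset_h = 70 / 32 * 2 * 4 = 16
//   thread_offset_w = 70 % 32 * 1     =  6
//
// i.e. each row of 32 threads owns a band of kStrideH * Iterations::kH rows of the C tile.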

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename TileTraits_, typename Index_ = int>
struct GemmGlobalIteratorAb
    : public TileLoadIterator<TileTraits_,
                              typename TileTraits_::Scalar,
                              TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH
                                                                         : IteratorAdvance::kW,
                              MemorySpace::kGlobal,
                              Index_> {
  /// This class.
  typedef GemmGlobalIteratorAb<TileTraits_, Index_> This_;
  /// The base class.
  typedef TileLoadIterator<TileTraits_,
                           typename TileTraits_::Scalar,
                           TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH
                                                                      : IteratorAdvance::kW,
                           MemorySpace::kGlobal,
                           Index_>
      Base;
  /// The layout.
  static MatrixLayout::Kind const kLayout = TileTraits_::kLayout;
  /// The tile.
  typedef typename TileTraits_::Tile Tile;
  /// Fragment type loaded by the iterator.
  typedef typename Base::Fragment Fragment;
  /// The scalar.
  typedef typename TileTraits_::Scalar Scalar;
  /// The threads.
  typedef typename TileTraits_::Threads Threads;
  /// The index.
  typedef Index_ Index;
  /// The thread offset functor.
  typedef typename TileTraits_::ThreadOffset ThreadOffset;
  /// Specifies in which dimension post-increment accesses advance.
  static IteratorAdvance::Kind const kAdvance = Base::kAdvance;

  /// The predicate vector, one bit per access.
  typedef cutlass::PredicateVector<ShapeCount<typename Base::Iterations>::kCount> PredicateVector;

  /// Iterator parameters type.
  typedef typename Base::Params BaseParams;

  struct Params : public BaseParams {
    /// Initializes params to load a strip-mined tile, given pointer and stride_h.
    CUTLASS_HOST_DEVICE int initialize(Scalar const* ptr, long long stride_d, Index stride_h) {
      Index inc_d = 0;
      Index inc_advance = 0;
      // Move by some columns for each iteration in the H dimension.
      Index inc_h = Base::Delta::kH * stride_h;

      // Move by some more columns if the iterator also steps in the D dimension.
      if (Base::Delta::kD > 0) {
        inc_d = Base::Delta::kD * stride_h - (Base::Iterations::kH - 1) * inc_h;
      }

      // Move to the beginning of the next iteration.
      if (kAdvance == IteratorAdvance::kH && Base::Delta::kD > 0) {
        inc_advance = inc_d;
      } else if (kAdvance == IteratorAdvance::kH) {
        inc_advance = inc_h;
      } else if (Base::Delta::kD > 0) {
        inc_advance = (Base::Iterations::kW + 0) * ShapeCount<typename Base::Delta>::kWc -
                      (Base::Iterations::kH - 1) * inc_h -
                      (Base::Iterations::kD - 1) * Base::Delta::kD * stride_h;
      } else {
        inc_advance = (Base::Iterations::kW + 0) * ShapeCount<typename Base::Delta>::kWc -
                      (Base::Iterations::kH - 1) * inc_h;
      }

      Base::Params::initialize(ptr, stride_d, stride_h, 1, inc_d, inc_h, 0, inc_advance);
      return 0;
    }
  };
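
  // Illustrative only (not part of the original file): with kAdvance == IteratorAdvance::kH,
  // Base::Delta::kH == 8, Base::Delta::kD == 0 and leading-dimension stride stride_h, the
  // increments computed above are
  //
  //   inc_h       = 8 * stride_h   // step between two H iterations inside the tile
  //   inc_d       = 0
  //   inc_advance = inc_h          // moving to the next tile is simply one more H step
  //
  // so after (Iterations::kH - 1) in-tile steps plus inc_advance, the pointer has moved
  // Iterations::kH * 8 * stride_h scalars, i.e. to the same lane of the next tile along K.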

  /// Offset of an individual lane from the start of the tile.
  Coord<4> thread_offset;
  /// The parameters.
  Params params;
  /// The predicates.
  PredicateVector predicates;

  /// Sets up the predicates to guard loads against the tile bounds.
  CUTLASS_HOST_DEVICE void initialize_predicates(const Coord<3>& bounds, const Coord<3>& block_offset) {
    // Setup the masks to control loads.
    predicates.fill(0);

    // Fill in the bits of the predicate vector.
    for (int d = 0; d < Base::Iterations::kD; ++d) {
      for (int h = 0; h < Base::Iterations::kH; ++h) {
        for (int w = 0; w < Base::Iterations::kW; ++w) {
          for (int c = 0; c < Base::Iterations::kC; ++c) {
            bool flag = w * Base::Delta::kW + thread_offset[2] + block_offset[2] < bounds[2];
            if (kAdvance == IteratorAdvance::kH) {
              flag =
                  flag &&
                  (h * Base::Delta::kH + d * Base::Delta::kD) + thread_offset[1] + block_offset[1] <
                      bounds[1];
            } else {
              flag = flag && (h * Base::Delta::kH) + thread_offset[1] + block_offset[1] < bounds[1];
            }
            int const bit = ComputeOffsetFromShape<typename Base::Iterations>::get(d, h, w, c);
            predicates.set(bit, flag);
          }
        }
      }
    }
  }

  /// Ctor.
  CUTLASS_HOST_DEVICE GemmGlobalIteratorAb(Params const& _params,
                                           const Coord<3>& bounds,
                                           const Coord<3>& threadblock_offset,
                                           ThreadOffset thread_offset_func = ThreadOffset())
      : params(_params) {
    thread_offset = thread_offset_func();
    // Setup the pointer.
    params.pointer += ((threadblock_offset[1] + thread_offset[1]) * params.stride_h +
                       (threadblock_offset[2] + thread_offset[2]));
  }

  /// Increment the pointer in the W dimension.
  CUTLASS_HOST_DEVICE void inc_w() { Base::inc_w(); }
  /// Increment the pointer in the H dimension.
  CUTLASS_HOST_DEVICE void inc_h() { params.pointer += params.inc_h; }
  /// Increment the pointer in the D dimension.
  CUTLASS_HOST_DEVICE void inc_d() { params.pointer += params.inc_d; }
  /// Increment the pointer to move to the next iteration.
  CUTLASS_HOST_DEVICE void inc_advance() { params.pointer += params.inc_advance; }

  /// Loads a single fragment element from memory.
  CUTLASS_HOST_DEVICE void load_element(
      typename Base::AccessType& value, int d, int h, int w, int c) const {
    int const offset =
        ComputeOffsetFromStrides<typename Base::ImmediateOffsetStrides>::get(d, h, w, c);
    Load<Scalar,
         Base::kAccessSize,
         Base::kMemorySpace,
         Base::kFragmentElementType,
         typename Base::FragmentElement,
         Base::Tile::kW,
         Base::kAccessSize * sizeof(Scalar)>::load(value, params.pointer, offset);
  }

  /// That's the residue! Update the predicates for the final partial tile in K.
  CUTLASS_HOST_DEVICE void residue(Index k) {
    // The coordinates of the thread.
    Index block_h = thread_offset[1];
    // The contiguous dimension.
    Index block_w = thread_offset[2];

    // Update the predicate vector.
    for (int d = 0; d < Base::Iterations::kD; ++d) {
      for (int h = 0; h < Base::Iterations::kH; ++h) {
        for (int w = 0; w < Base::Iterations::kW; ++w) {
          for (int c = 0; c < Base::Iterations::kC; ++c) {
            Index offset = 0;
            if (kAdvance == IteratorAdvance::kH) {
              offset += block_h + h * Base::Delta::kH + d * Base::Delta::kD;
            } else {
              offset += block_w + w * Base::Delta::kW;
            }

            int const bit = ComputeOffsetFromShape<typename Base::Iterations>::get(d, h, w, c);
            if (offset >= k) {
              predicates.set(bit, false);
            }
          }
        }
      }
    }
  }
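
  // Illustrative only (not part of the original file): for K == 13 with an 8-deep tile
  // advancing along H, the final mainloop step covers only 13 % 8 == 5 valid rows of K.
  // Calling residue(5) clears the predicate of any access whose K coordinate
  // (block_h + h * Delta::kH + d * Delta::kD) is >= 5, so those LDGs are skipped.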

  /// Is the access at (d, h, w, c) valid?
  CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const {
    int const bit = ComputeOffsetFromShape<typename Base::Iterations>::get(d, h, w, c);
    return predicates[bit];
  }

  /// Adds a vector offset to the iterator.
  CUTLASS_HOST_DEVICE GemmGlobalIteratorAb& operator+=(Coord<3> const& offset) {
    long long _offset = offset.template dot<long long>(
        make_Coord(params.stride_d, params.stride_h, params.stride_w));

    params.pointer += _offset;
    return *this;
  }

  /// Adds an offset, in scalars, to the pointer.
  CUTLASS_HOST_DEVICE void add_pointer_offset(Index offset) { params.pointer += offset; }

  /// Returns the stride in the advance dimension.
  CUTLASS_HOST_DEVICE Index stride_advance(void) {
    Index stride = params.stride_h;
    if (kAdvance == IteratorAdvance::kW) {
      stride = params.stride_w;
    }
    return stride;
  }

  /// Loads a tile and advances the iterator to the next tile.
  template <typename Fragment>
  CUTLASS_HOST_DEVICE void load_post_increment(Fragment& fragment) {
    typename Base::FragmentIterator frag_iterator(fragment);
    for (int d = 0; d < Base::Iterations::kD; ++d) {
      for (int h = 0; h < Base::Iterations::kH; ++h) {
        for (int w = 0; w < Base::Iterations::kW; ++w) {
          for (int c = 0; c < Base::Iterations::kC; ++c) {
            if (valid(d, h, w, c)) {
              load_element(
                  reinterpret_cast<typename Base::AccessType&>(frag_iterator.at(d, h, w, c)),
                  d,
                  h,
                  w,
                  c);
            }
          }
          if (w < Base::Iterations::kW - 1) {
            inc_w();
          }
        }
        if (h < Base::Iterations::kH - 1) {
          inc_h();
        }
      }
      if (d < Base::Iterations::kD - 1) {
        inc_d();
      }
    }
    inc_advance();
  }
};
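
// Illustrative usage sketch (not part of the original file; names, shapes, and the K-loop
// step are hypothetical). A mainloop streams the A operand tile-by-tile along K by pairing
// load_post_increment() with residue() for the final partial tile:
//
//   typedef GemmGlobalTileTraits<GemmOperand::kA, MatrixLayout::kColumnMajor,
//                                float const, Shape<1, 8, 128>, Shape<1, 8, 32>, 4> TraitsA;
//   typedef GemmGlobalIteratorAb<TraitsA> IteratorA;
//
//   __device__ void consume_a(typename IteratorA::Params const& params,
//                             Coord<3> bounds, Coord<3> block_offset, int k) {
//     IteratorA iter_a(params, bounds, block_offset);
//     iter_a.initialize_predicates(bounds, block_offset);
//     typename IteratorA::Fragment fragment;
//     for (; k > 0; k -= 8) {
//       if (k < 8) {
//         iter_a.residue(k);  // mask off out-of-range K rows in the last tile
//       }
//       iter_a.load_post_increment(fragment);  // guarded LDGs, then advance to the next tile
//       // ... store `fragment` to shared memory and run the math mainloop ...
//     }
//   }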

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename TileTraits_, typename Index_ = int>
struct GemmGlobalIteratorCd : public TileIteratorBase<TileTraits_,
                                                      typename TileTraits_::Scalar,
                                                      IteratorAdvance::kH,
                                                      MemorySpace::kGlobal,
                                                      Index_> {
  /// This class.
  typedef GemmGlobalIteratorCd<TileTraits_, Index_> This_;
  /// The base class.
  typedef TileIteratorBase<TileTraits_,
                           typename TileTraits_::Scalar,
                           IteratorAdvance::kH,
                           MemorySpace::kGlobal,
                           Index_>
      Base;

  /// The layout.
  static MatrixLayout::Kind const kLayout = TileTraits_::kLayout;

  /// The scalar.
  typedef typename TileTraits_::Scalar Scalar;
  /// The pointer.
  typedef typename TileTraits_::Pointer Pointer;
  /// The threads.
  typedef typename TileTraits_::Threads Threads;
  /// The index.
  typedef Index_ Index;
  /// The thread offset functor.
  typedef typename TileTraits_::ThreadOffset ThreadOffset;

  /// The params.
  struct Params {
    /// The pointer.
    Pointer pointer;
    /// The stride in the D dimension.
    long long stride_d;
    /// The stride in the H dimension to setup the thread in the block.
    Index stride_h;
    /// The strides to increment the pointer.
    Index inc_advance, inc_h;
    /// The strides to increment the predicate offset.
    Index predicate_inc_advance, predicate_inc_h;
    /// The column offset used to compute the predicates for the columns.
    Index predicate_offset;

    /// Setup the params.
    CUTLASS_HOST_DEVICE int initialize(Pointer pointer,
                                       long long batch_stride,
                                       Index ldm,
                                       Index bound,
                                       Index epilogue_stride_w,
                                       Index epilogue_delta_w) {
      // The pointer.
      this->pointer = pointer;
      // Stride per batch.
      stride_d = batch_stride;
      // Each column of the matrix.
      stride_h = TileTraits_::ThreadsDelta::kH * ldm;
      // Each thread outputs 1 column per iteration. The stride between columns is given by the
      // number of scalars that are loaded per LDS for B.
      inc_h = ldm * TileTraits_::kStrideH;
      inc_advance =
          (ldm - ldm * TileTraits_::kStrideH * (Base::Iterations::kH - 1)) + epilogue_stride_w;

      predicate_offset = bound;
      predicate_inc_h = TileTraits_::kStrideH;
      predicate_inc_advance =
          -((TileTraits_::kStrideH * (Base::Iterations::kH - 1) - 1) + epilogue_delta_w);

      return 0;
    }
  };
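
  // Illustrative only (not part of the original file): with ldm == 1024, kStrideH == 2,
  // Iterations::kH == 4 and zero epilogue strides, the formulas above give
  //
  //   inc_h       = 1024 * 2              // advance two columns per H iteration
  //   inc_advance = 1024 - 1024 * 2 * 3   // rewind over the tile's columns, plus one column
  //
  // while predicate_offset starts at `bound` and counts down via predicate_inc_h as the
  // iterator walks the columns, disabling accesses once it reaches zero.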

  /// Parameters.
  Params params;
  /// Offset of an individual lane from the start of the tile.
  Coord<4> thread_offset;
  /// The predicates for the row.
  cutlass::PredicateVector<Base::Iterations::kW> predicates;

  /// Ctor.
  CUTLASS_HOST_DEVICE GemmGlobalIteratorCd(Params const& _params,
                                           const Coord<3>& bounds,
                                           const Coord<3>& block_offset,
                                           ThreadOffset thread_offset_func = ThreadOffset())
      : params(_params) {
    thread_offset = thread_offset_func();
    // Prepare the vector of predicates.
    for (int i = 0; i < Base::Iterations::kW; ++i) {
      predicates.set(i, thread_offset[2] + i * Base::Delta::kW < bounds[2]);
    }
  }

  /// Ctor.
  CUTLASS_HOST_DEVICE GemmGlobalIteratorCd(Params const& _params,
                                           const Coord<3>& bounds,
                                           const Coord<3>& block,
                                           int offset = 0,
                                           int pred_offset = 0,
                                           ThreadOffset thread_offset_func = ThreadOffset())
      : params(_params) {
    thread_offset = thread_offset_func();
    // Each warp works on a different column of the tile.
    int const h = thread_offset[1] + block[1];
    // Each lane writes a different element.
    int const w = thread_offset[2] + block[2];
    // Setup the pointer.
    params.pointer += ((h * params.stride_h + w) + offset);

    // Prepare the vector of predicates.
    for (int i = 0; i < Base::Iterations::kW; ++i) {
      predicates.set(i, w + i * Base::Delta::kW < bounds[2]);
    }
    params.predicate_offset -= (h + pred_offset);
  }

  /// Increment the pointer in the C dimension.
  CUTLASS_HOST_DEVICE void inc_c() {}
  /// Increment the pointer in the W dimension.
  CUTLASS_HOST_DEVICE void inc_w() {}
  /// Increment the pointer in the H dimension.
  CUTLASS_HOST_DEVICE void inc_h() {
    params.pointer += params.inc_h;
    params.predicate_offset -= params.predicate_inc_h;
  }
  /// Increment the pointer in the D dimension.
  CUTLASS_HOST_DEVICE void inc_d() {}
  /// Increment the pointer to move to the next iteration.
  CUTLASS_HOST_DEVICE void inc_advance() {
    params.pointer += params.inc_advance;
    params.predicate_offset -= params.predicate_inc_advance;
  }

  /// Adds a vector offset to the iterator.
  CUTLASS_HOST_DEVICE GemmGlobalIteratorCd& operator+=(Coord<3> const& offset) {
    long long _offset = offset.template dot<long long>(
        make_Coord(params.stride_d, params.stride_h, 1));
    params.pointer += _offset;
    return *this;
  }

  /// Loads a single fragment element from memory.
  CUTLASS_HOST_DEVICE void load_element(
      typename Base::AccessType& value, int d, int h, int w, int c) const {
    int const offset =
        ComputeOffsetFromStrides<typename Base::ImmediateOffsetStrides>::get(d, h, w, c);
    Load<Scalar,
         Base::kAccessSize,
         Base::kMemorySpace,
         Base::kFragmentElementType,
         typename Base::FragmentElement,
         Base::Tile::kW,
         Base::kAccessSize * sizeof(Scalar)>::load(value, params.pointer, offset);
  }

  /// Stores a single fragment element into memory.
  CUTLASS_HOST_DEVICE void store_element(
      typename Base::AccessType const& value, int d, int h, int w, int c) {
    int const offset =
        ComputeOffsetFromStrides<typename Base::ImmediateOffsetStrides>::get(d, h, w, c);
    Store<Scalar,
          Base::kAccessSize,
          Base::kMemorySpace,
          Base::kFragmentElementType,
          typename Base::FragmentElement,
          Base::Tile::kW,
          Base::kAccessSize * sizeof(Scalar)>::store(value, params.pointer, offset);
  }

  /// Tests the validity of the access at (d, h, w, c).
  CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const {
    return predicates.at(w) && params.predicate_offset > 0;
  }

  /// Adds an offset, in scalars, to the pointer.
  CUTLASS_HOST_DEVICE void add_pointer_offset(Index offset) { params.pointer += offset; }

  /// Loads a fragment and increments the iterator.
  template <typename Fragment>
  CUTLASS_HOST_DEVICE void load_post_increment(Fragment& fragment) {
    typename Base::FragmentIterator frag_iterator(fragment);
    for (int d = 0; d < Base::Iterations::kD; ++d) {
      for (int h = 0; h < Base::Iterations::kH; ++h) {
        for (int w = 0; w < Base::Iterations::kW; ++w) {
          for (int c = 0; c < Base::Iterations::kC; ++c) {
            if (valid(d, h, w, c)) {
              load_element(
                  reinterpret_cast<typename Base::AccessType&>(frag_iterator.at(d, h, w, c)),
                  d,
                  h,
                  w,
                  c);
            }
          }
          if (w < Base::Iterations::kW - 1) {
            inc_w();
          }
        }
        if (h < Base::Iterations::kH - 1) {
          inc_h();
        }
      }
      if (d < Base::Iterations::kD - 1) {
        inc_d();
      }
    }
    inc_advance();
  }

  /// Stores a fragment and increments the iterator.
  template <typename Fragment>
  CUTLASS_HOST_DEVICE void store_post_increment(Fragment& fragment) {
    typename Base::FragmentIterator frag_iterator(fragment);
    for (int d = 0; d < Base::Iterations::kD; ++d) {
      for (int h = 0; h < Base::Iterations::kH; ++h) {
        for (int w = 0; w < Base::Iterations::kW; ++w) {
          for (int c = 0; c < Base::Iterations::kC; ++c) {
            if (valid(d, h, w, c)) {
              store_element(
                  reinterpret_cast<typename Base::AccessType&>(frag_iterator.at(d, h, w, c)),
                  d,
                  h,
                  w,
                  c);
            }
          }
          if (w < Base::Iterations::kW - 1) {
            inc_w();
          }
        }
        if (h < Base::Iterations::kH - 1) {
          inc_h();
        }
      }
      if (d < Base::Iterations::kD - 1) {
        inc_d();
      }
    }
    inc_advance();
  }
};

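// Illustrative usage sketch (not part of the original file; the traits configuration and
// epilogue math are hypothetical). An epilogue typically pairs two of these iterators over
// the same coordinates, one reading C and one writing D:
//
//   typedef GemmGlobalTileCdTraits<float, Shape<1, 8, 128>, Shape<1, 8, 32>, 2, 1> TraitsC;
//   typedef GemmGlobalIteratorCd<TraitsC> IteratorC;
//
//   __device__ void epilogue_step(typename IteratorC::Params const& params_c,
//                                 typename IteratorC::Params const& params_d,
//                                 Coord<3> bounds, Coord<3> block) {
//     IteratorC iter_c(params_c, bounds, block);
//     IteratorC iter_d(params_d, bounds, block);
//     typename IteratorC::Base::Fragment frag;
//     iter_c.load_post_increment(frag);   // guarded LDGs of the C tile
//     // ... frag = alpha * accumulators + beta * frag ...
//     iter_d.store_post_increment(frag);  // guarded STGs of the D tile
//   }
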
////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace gemm
}  // namespace cutlass