374882be53f2a7558aeb6c4955b8b9da75b29ecf/docs/gemm__global__tile_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <cutlass/coord.h>
 #include <cutlass/util/platform.h>

 #include <cutlass/gemm/gemm_operand.h>
 #include <cutlass/matrix_traits.h>
 #include <cutlass/predicate_vector.h>
 #include <cutlass/reshape_tile.h>
 #include <cutlass/tile_iterator.h>

 namespace cutlass {
 namespace gemm {


 // The following functor reshapes a tile of threads to match a tile of data. The idea is that when
 // the user wants to build the iterator traits, he/she may want to specify the tile independently
 // from the number of scalars loaded/stored per instruction. For example, in the row-major version
 // with a tile of size 128x8, the user may want the iterator to work with 32x8 threads if each
 // thread loads 1 scalar per LDG. If the user changes to 4 scalars per LDG, then the tile of
 // threads has to change. The code below detects that and corrects the tile of threads
 // automatically - it is a helper for when the user does not specify the right configuration.

 /// Chooses the arrangement of threads used to cover a tile.
 ///
 /// This is the primary template. It is selected whenever the last template argument is false,
 /// which by default means that the tile is at least as wide as the arrangement of threads
 /// (Tile_::kW >= Threads_::kW). In that case the requested arrangement is used as-is.
 template <typename Tile_,
           typename Threads_,
           bool kTileIsNarrower_ = (Tile_::kW < Threads_::kW)>
 struct ReshapeThreads {
   /// The arrangement of threads, exactly as requested.
   typedef Threads_ Threads;
 };

 /// Specialization selected when the last template argument is true, which by default means that
 /// the tile is narrower than the requested arrangement of threads (Tile_::kW < Threads_::kW).
 ///
 /// The arrangement is narrowed to the width of the tile and the H dimension is scaled so that
 /// kH * kW is preserved (up to integer division); the C dimension is set to 1.
 template <typename Tile_, typename Threads_>
 struct ReshapeThreads<Tile_, Threads_, true> {
   /// The reshaped arrangement of threads.
   typedef Shape<Threads_::kD,
                 Threads_::kH * Threads_::kW / Tile_::kW,
                 Tile_::kW,
                 1>
       Threads;
 };


 /// Traits describing how a tile of a GEMM operand in global memory is divided among the threads
 /// that access it, kAccessSize_ scalars at a time.
 template <GemmOperand::Kind kOperand_,
           MatrixLayout::Kind kLayout_,
           typename Scalar_,
           typename Tile_,
           typename Threads_,
           int kAccessSize_>
 struct GemmGlobalTileTraits {
   //
   // Constants
   //

   /// Identity of the operand.
   static GemmOperand::Kind const kOperand = kOperand_;
   /// The layout.
   static MatrixLayout::Kind const kLayout = kLayout_;
   /// The number of scalars per LDG/STG.
   static int const kAccessSize = kAccessSize_;
   /// The memory space.
   static MemorySpace::Kind const kMemorySpace = MemorySpace::kGlobal;

   //
   // Types
   //

   /// The scalar, with any const qualifier stripped.
   typedef typename platform::remove_const<Scalar_>::type Scalar;
   /// The pointer. It is built from Scalar_ directly, so a const Scalar_ gives a pointer-to-const.
   typedef Scalar_* Pointer;

   /// The tile shape, as produced by ReshapeTile for the requested access size.
   typedef typename ReshapeTile<Tile_, kAccessSize_>::Tile Tile;
   /// The threads shape, adjusted by ReshapeThreads to fit the (reshaped) tile.
   typedef typename ReshapeThreads<Tile, Threads_>::Threads Threads;
   /// The relative offset between two elements in the H/W dimension in adjacent threads.
   typedef Shape<1, 1, Tile::kC> ThreadsDelta;

   /// The strides in each dimension between different loads/stores.
   typedef Shape<0, Threads::kH, Threads::kW * kAccessSize> Delta;
   /// Strides for immediate offset computation.
   typedef Shape<0, 0, Threads::kW * ThreadsDelta::kW, kAccessSize> ImmediateOffsetStrides;
   /// The number of iterations needed to load/store the tile.
   typedef Shape<1, Tile::kH / Threads::kH, Tile::kW / Threads::kW, Tile::kC / kAccessSize>
       Iterations;

   /// The multiplicand traits for this tile, operand and layout.
   typedef GemmMultiplicandTraits<Tile, kOperand, kLayout> MultiplicandTraits;

   /// Computes the thread offset in (H, W) based on thread ID.
   struct ThreadOffset {
     /// Returns the coordinate (0, h, w, 0) of the calling thread. The thread index is split
     /// into a quotient and a remainder by Threads::kW, and each part is scaled by the matching
     /// entry of ThreadsDelta.
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       int const offset_in_h = threadIdx.x / Threads::kW * ThreadsDelta::kH;
       int const offset_in_w = threadIdx.x % Threads::kW * ThreadsDelta::kW;
       return make_Coord(0, offset_in_h, offset_in_w, 0);
     }
   };
 };


 /// Tile traits built on GemmGlobalTileTraits with the operand fixed to GemmOperand::kC and the
 /// layout fixed to MatrixLayout::kColumnMajor. It overrides Delta and ThreadOffset and adds a
 /// compile-time stride in the H dimension.
 template <typename Scalar_, typename Tile_, typename Threads_, int kStrideH_, int kAccessSize_>
 struct GemmGlobalTileCdTraits : public GemmGlobalTileTraits<GemmOperand::kC,
                                                             MatrixLayout::kColumnMajor,
                                                             Scalar_,
                                                             Tile_,
                                                             Threads_,
                                                             kAccessSize_> {
   /// The base class.
   typedef GemmGlobalTileTraits<GemmOperand::kC,
                                MatrixLayout::kColumnMajor,
                                Scalar_,
                                Tile_,
                                Threads_,
                                kAccessSize_>
       Base;

   /// The stride in the H dimension.
   static int const kStrideH = kStrideH_;

   // Types taken over unchanged from the base class, re-declared so that they can be named
   // without qualification below.
   typedef typename Base::Iterations Iterations;
   typedef typename Base::Threads Threads;
   typedef typename Base::ThreadsDelta ThreadsDelta;
   typedef typename Base::ImmediateOffsetStrides ImmediateOffsetStrides;

   /// Override the strides in each dimension between different loads/stores: the H stride of the
   /// base class is replaced by 0, the W and C strides are kept.
   typedef Shape<0, 0, Base::Delta::kW, Base::Delta::kC> Delta;

   /// Computes the thread offset in (H, W) based on thread ID.
   struct ThreadOffset {
     /// Returns the coordinate (0, h, w, 0) of the calling thread. Unlike the base class, the H
     /// offset is scaled by kStrideH * Iterations::kH instead of ThreadsDelta::kH.
     CUTLASS_HOST_DEVICE
     Coord<4> operator()() const {
       int const offset_in_h = threadIdx.x / Threads::kW * kStrideH * Iterations::kH;
       int const offset_in_w = threadIdx.x % Threads::kW * ThreadsDelta::kW;
       return make_Coord(0, offset_in_h, offset_in_w, 0);
     }
   };
 };


 /// Iterator that loads a tile from global memory. It is built on TileLoadIterator and advances
 /// along H when TileTraits_::MultiplicandTraits::kKstrided is true, along W otherwise. On top of
 /// the base iterator it keeps one predicate bit per access so individual loads can be masked.
 /// NOTE(review): the "Ab" suffix suggests it serves the A and B operands of the GEMM - confirm
 /// against the callers.
 template <typename TileTraits_, typename Index_ = int>
 struct GemmGlobalIteratorAb
     : public TileLoadIterator<TileTraits_,
                               typename TileTraits_::Scalar,
                               TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH
                                                                          : IteratorAdvance::kW,
                               MemorySpace::kGlobal,
                               Index_> {
   /// This class.
   typedef GemmGlobalIteratorAb<TileTraits_, Index_> This_;

   /// The base class.
   typedef TileLoadIterator<TileTraits_,
                            typename TileTraits_::Scalar,
                            TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH
                                                                       : IteratorAdvance::kW,
                            MemorySpace::kGlobal,
                            Index_>
       Base;
   /// The layout.
   static MatrixLayout::Kind const kLayout = TileTraits_::kLayout;
   /// Fragment type loaded by the iterator.
   typedef typename Base::Fragment Fragment;
   /// The scalar.
   typedef typename TileTraits_::Scalar Scalar;
   /// The threads.
   typedef typename TileTraits_::Threads Threads;
   /// The index.
   typedef Index_ Index;
   /// The thread offset.
   typedef typename TileTraits_::ThreadOffset ThreadOffset;
   /// Specifies in which dimension post-increment accesses advance.
   static IteratorAdvance::Kind const kAdvance = Base::kAdvance;

   /// One predicate bit for every entry of Base::Iterations.
   typedef cutlass::PredicateVector<ShapeCount<typename Base::Iterations>::kCount> PredicateVector;

   /// Iterator parameters type.
   typedef typename Base::Params BaseParams;

   /// Parameters of the iterator: the base parameters plus an initializer that derives the
   /// pointer increments from a single stride.
   struct Params : public BaseParams {
     /// Initializes params to load a strip-mined tile, given pointer and stride_h.
     /// Always returns 0.
     CUTLASS_HOST_DEVICE int initialize(Scalar const* ptr, Index stride_h) {
       Index inc_d = 0;
       Index inc_advance = 0;
       // Move by Delta::kH strides for each iteration in the H dimension.
       Index inc_h = Base::Delta::kH * stride_h;

       // If there is a stride in the D dimension (Delta::kD > 0), move by that stride and take
       // back the (Iterations::kH - 1) steps made in H.
       if (Base::Delta::kD > 0) {
         inc_d = Base::Delta::kD * stride_h - (Base::Iterations::kH - 1) * inc_h;
       }

       // Move to the beginning of the next iteration. When advancing in H the increment is the
       // D increment (if any) or the H increment. When advancing in W it is the extent covered
       // in W, minus the steps already made in H (and in D, if any).
       if (kAdvance == IteratorAdvance::kH && Base::Delta::kD > 0) {
         inc_advance = inc_d;
       } else if (kAdvance == IteratorAdvance::kH) {
         inc_advance = inc_h;
       } else if (Base::Delta::kD > 0) {
         inc_advance = (Base::Iterations::kW + 0) * ShapeCount<typename Base::Delta>::kWc -
                       (Base::Iterations::kH - 1) * inc_h -
                       (Base::Iterations::kD - 1) * Base::Delta::kD * stride_h;
       } else {
         inc_advance = (Base::Iterations::kW + 0) * ShapeCount<typename Base::Delta>::kWc -
                       (Base::Iterations::kH - 1) * inc_h;
       }

       // NOTE(review): the argument order is presumably (pointer, stride_d, stride_h, stride_w,
       // inc_d, inc_h, inc_w, inc_advance) - confirm against tile_iterator.h.
       Base::Params::initialize(ptr, 0, stride_h, 0, inc_d, inc_h, 0, inc_advance);
       return 0;
     }
   };

   /// Offset of an individual lane from the start of the tile.
   Coord<4> thread_offset;
   /// The parameters.
   Params params;

   /// Rebuilds the whole predicate vector. Every bit is first cleared, then set for each
   /// (d, h, w, c) whose offsets fall below the bounds in W and H.
   ///
   /// \param bounds The extents to test against. Entries [1] and [2] are read.
   /// \param block  The offsets subtracted from bounds. Entry [2] is read when advancing in H,
   ///               entry [1] otherwise.
   CUTLASS_DEVICE void initialize_predicates(const Coord<3>& bounds, const Coord<3>& block) {
     // Setup the masks to control loads.
     predicates.fill(0);

     // Pick the bounds for W and H. Which entries are used depends on the advance dimension.
     int bounds_h, bounds_w;
     if (kAdvance == IteratorAdvance::kH) {
       bounds_w = bounds[2] - block[2];
       bounds_h = bounds[1];

     } else {
       bounds_w = bounds[1];
       bounds_h = bounds[2] - block[1];
     }

     // Fill in the bits of the predicate vector.
     for (int d = 0; d < Base::Iterations::kD; ++d) {
       for (int h = 0; h < Base::Iterations::kH; ++h) {
         for (int w = 0; w < Base::Iterations::kW; ++w) {
           for (int c = 0; c < Base::Iterations::kC; ++c) {
             // The access must be inside the bound in W...
             bool flag = w * Base::Delta::kW < bounds_w;
             // ...and inside the bound in H. The D stride only contributes when advancing in H.
             if (kAdvance == IteratorAdvance::kH) {
               flag = flag && (h * Base::Delta::kH + d * Base::Delta::kD) < bounds_h;
             } else {
               flag = flag && (h * Base::Delta::kH) < bounds_h;
             }
             // The bit index follows the layout of Base::Iterations.
             int const bit = ComputeOffsetFromShape<typename Base::Iterations>::get(d, h, w, c);
             predicates.set(bit, flag);
           }
         }
       }
     }
   }

   /// Ctor. Copies the params, offsets the pointer to the position of the calling thread and
   /// initializes the predicates.
   ///
   /// \param _params            The parameters (copied into the iterator).
   /// \param bounds             The bounds forwarded to initialize_predicates.
   /// \param block              The block offsets added to the thread offset. Entries [1] and
   ///                           [2] are read; they are swapped when advancing in W.
   /// \param thread_offset_func The functor computing the offset of the thread.
   CUTLASS_DEVICE GemmGlobalIteratorAb(Params const& _params,
                                       const Coord<3>& bounds,
                                       const Coord<3>& block,
                                       ThreadOffset thread_offset_func = ThreadOffset())
       : params(_params) {
     thread_offset = thread_offset_func();
     // The offset in the H dimension.
     Index block_h = thread_offset[1];
     // The offset in the W dimension (the contiguous dimension).
     Index block_w = thread_offset[2];

     // Add the blocks indices.
     if (kAdvance == IteratorAdvance::kH) {
       block_h += block[1];
       block_w += block[2];

     } else {
       block_h += block[2];
       block_w += block[1];
     }

     // Setup the pointer.
     params.pointer += (block_h * params.stride_h + block_w);

     // Initialize predicates. The combined thread + block offsets are passed as the "block".
     initialize_predicates(bounds, make_Coord(0, block_h, block_w));
   }

   /// Increment the pointer in the H dimension.
   CUTLASS_DEVICE void inc_h() { params.pointer += params.inc_h; }
   /// Increment the pointer in the D dimension.
   CUTLASS_DEVICE void inc_d() { params.pointer += params.inc_d; }
   /// Increment the pointer to move to the next iteration.
   CUTLASS_DEVICE void inc_advance() { params.pointer += params.inc_advance; }

   /// Returns the current pointer.
   CUTLASS_HOST_DEVICE
   Scalar const* data() const { return params.pointer; }

   /// That's the residue! Update the predicates: every access whose offset along the advance
   /// dimension is greater than or equal to k is disabled. Predicates are never re-enabled here.
   CUTLASS_DEVICE void residue(Index k) {
     // The offset of the thread in the H dimension.
     Index block_h = thread_offset[1];
     // The offset of the thread in the W dimension (the contiguous dimension).
     Index block_w = thread_offset[2];

     // Update the predicate vector.
     for (int d = 0; d < Base::Iterations::kD; ++d) {
       for (int h = 0; h < Base::Iterations::kH; ++h) {
         for (int w = 0; w < Base::Iterations::kW; ++w) {
           for (int c = 0; c < Base::Iterations::kC; ++c) {
             // The offset of this access along the advance dimension.
             Index offset = 0;
             if (kAdvance == IteratorAdvance::kH) {
               offset += block_h + h * Base::Delta::kH + d * Base::Delta::kD;
             } else {
               offset += block_w + w * Base::Delta::kW;
             }

             int const bit = ComputeOffsetFromShape<typename Base::Iterations>::get(d, h, w, c);
             if (offset >= k) {
               predicates.set(bit, false);
             }
           }
         }
       }
     }
   }

   /// Is the iterator valid? Returns the predicate stored for the access (d, h, w, c).
   CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const {
     int const bit = ComputeOffsetFromShape<typename Base::Iterations>::get(d, h, w, c);
     return predicates[bit];
   }

   /// The predicates.
   PredicateVector predicates;
 };


 /// Iterator over a tile in global memory, built on TileIteratorBase and advancing in the H
 /// dimension. Validity is tracked with one predicate per iteration in W plus a running offset
 /// that is decremented as the iterator moves in H.
 /// NOTE(review): the "Cd" suffix suggests it serves the C and D matrices of the GEMM - confirm
 /// against the callers.
 template <typename TileTraits_, typename Index_ = int>
 struct GemmGlobalIteratorCd : public TileIteratorBase<TileTraits_,
                                                       typename TileTraits_::Scalar,
                                                       IteratorAdvance::kH,
                                                       MemorySpace::kGlobal,
                                                       Index_> {
   /// This class.
   typedef GemmGlobalIteratorCd<TileTraits_, Index_> This_;
   /// The base class.
   typedef TileIteratorBase<TileTraits_,
                            typename TileTraits_::Scalar,
                            IteratorAdvance::kH,
                            MemorySpace::kGlobal,
                            Index_>
       Base;

   /// The layout.
   static MatrixLayout::Kind const kLayout = TileTraits_::kLayout;

   /// The scalar.
   typedef typename TileTraits_::Scalar Scalar;
   /// The pointer.
   typedef typename TileTraits_::Pointer Pointer;
   /// The threads.
   typedef typename TileTraits_::Threads Threads;
   /// The index.
   typedef Index_ Index;
   /// The thread offset.
   typedef typename TileTraits_::ThreadOffset ThreadOffset;

   /// The params.
   struct Params {
     /// The pointer.
     Pointer pointer;
     /// The stride in the H dimension to setup the thread in the block.
     Index stride_h;
     /// The stride to increment the pointer when moving to the next iteration.
     Index inc_advance;
     /// The stride to increment the pointer in the H dimension.
     Index inc_h;
     /// The stride to increment the predicate offset when moving to the next iteration.
     Index predicate_inc_advance;
     /// The stride to increment the predicate offset in the H dimension.
     Index predicate_inc_h;
     /// The column offset to compute the predicate for the columns.
     Index predicate_offset;

     /// Setup the params. Always returns 0.
     ///
     /// \param pointer           The pointer to start from.
     /// \param ld                The leading dimension; it scales every pointer stride in H.
     /// \param bound             The initial value of the predicate offset.
     /// \param epilogue_stride_w Extra amount added to the pointer increment of inc_advance.
     /// \param epilogue_delta_w  Extra amount folded into the predicate increment of inc_advance.
     CUTLASS_HOST_DEVICE int initialize(
         Pointer pointer, Index ld, Index bound, Index epilogue_stride_w, Index epilogue_delta_w) {
       // Predicate bookkeeping: it counts in units of kStrideH, mirroring the pointer strides
       // below without the leading dimension.
       predicate_offset = bound;
       predicate_inc_h = TileTraits_::kStrideH;
       predicate_inc_advance =
           -((TileTraits_::kStrideH * (Base::Iterations::kH - 1) - 1) + epilogue_delta_w);

       // Pointer bookkeeping.
       this->pointer = pointer;
       // The stride used to place each thread in the H dimension.
       stride_h = TileTraits_::ThreadsDelta::kH * ld;
       // Consecutive iterations in H are kStrideH leading dimensions apart.
       inc_h = ld * TileTraits_::kStrideH;
       // Take back the steps made in H, move by one leading dimension and add the extra stride.
       inc_advance =
           (ld - ld * TileTraits_::kStrideH * (Base::Iterations::kH - 1)) + epilogue_stride_w;

       return 0;
     }
   };

   /// The parameters.
   Params params;
   /// Offset of an individual lane from the start of the tile.
   Coord<4> thread_offset;

   /// Ctor. Leaves the members default-initialized.
   CUTLASS_DEVICE GemmGlobalIteratorCd() {}

   /// Ctor. Copies the params, moves the pointer to the position of the calling thread and
   /// prepares the predicates.
   ///
   /// \param params             The parameters (copied into the iterator).
   /// \param bounds             The bounds; entry [2] limits the W dimension.
   /// \param block              The block offsets; entries [1] and [2] are added to the thread
   ///                           offset in H and W.
   /// \param offset             Extra offset added to the pointer.
   /// \param pred_offset        Extra offset subtracted from the predicate offset.
   /// \param thread_offset_func The functor computing the offset of the thread.
   CUTLASS_DEVICE GemmGlobalIteratorCd(Params const& params,
                                       const Coord<3>& bounds,
                                       const Coord<3>& block,
                                       int offset = 0,
                                       int pred_offset = 0,
                                       ThreadOffset thread_offset_func = ThreadOffset())
       : params(params) {
     thread_offset = thread_offset_func();

     // Position of the thread in H and W, including the block offsets.
     int const coord_h = thread_offset[1] + block[1];
     int const coord_w = thread_offset[2] + block[2];

     // Note: inside this constructor the plain name `params` is the argument; the member is
     // reached through `this->`.
     this->params.pointer += ((coord_h * params.stride_h + coord_w) + offset);
     this->params.predicate_offset -= (coord_h + pred_offset);

     // One predicate per iteration in W: is that access inside the bound?
     for (int iter_w = 0; iter_w < Base::Iterations::kW; ++iter_w) {
       predicates.set(iter_w, coord_w + iter_w * Base::Delta::kW < bounds[2]);
     }
   }

   /// Increment the pointer in the C dimension (nothing to do).
   CUTLASS_DEVICE void inc_c() {}

   /// Increment the pointer in the W dimension (nothing to do).
   CUTLASS_DEVICE void inc_w() {}

   /// Increment the pointer in the H dimension and update the predicate offset accordingly.
   CUTLASS_DEVICE void inc_h() {
     params.pointer += params.inc_h;
     params.predicate_offset -= params.predicate_inc_h;
   }

   /// Increment the pointer in the D dimension (nothing to do).
   CUTLASS_DEVICE void inc_d() {}

   /// Increment the pointer to move to the next iteration and update the predicate offset.
   CUTLASS_DEVICE void inc_advance() {
     params.pointer += params.inc_advance;
     params.predicate_offset -= params.predicate_inc_advance;
   }

   /// Test the validity of the iterator. Only w is looked at: the access is valid when its
   /// predicate in W is set and the predicate offset is still positive.
   CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const {
     if (!predicates.at(w)) {
       return false;
     }
     return params.predicate_offset > 0;
   }

   /// Returns the raw pointer.
   CUTLASS_HOST_DEVICE
   Pointer data() { return params.pointer; }

   /// Returns the raw pointer.
   CUTLASS_HOST_DEVICE
   Pointer const data() const { return params.pointer; }

   /// The predicates for the row.
   cutlass::PredicateVector<Base::Iterations::kW> predicates;
 };


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::GemmGlobalTileCdTraits
Definition: gemm_global_tile.h:116

cutlass::gemm::GemmGlobalTileTraits::Delta
Shape< 0, Threads::kH, Threads::kW *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_global_tile.h:92

cutlass::gemm::GemmGlobalIteratorCd::Params::inc_advance
Index inc_advance
The strides to increment the pointer.
Definition: gemm_global_tile.h:384

cutlass::gemm::GemmGlobalIteratorCd::inc_d
CUTLASS_DEVICE void inc_d()
Increment the pointer in the D dimension.
Definition: gemm_global_tile.h:452

cutlass
Definition: convert.h:33

cutlass::gemm::GemmGlobalIteratorAb::PredicateVector
cutlass::PredicateVector< ShapeCount< typename Base::Iterations >::kCount > PredicateVector
Definition: gemm_global_tile.h:191

cutlass::gemm::GemmGlobalIteratorAb::kLayout
static MatrixLayout::Kind const kLayout
The layout.
Definition: gemm_global_tile.h:177

cutlass::platform::remove_const::type
T type
Definition: platform.h:369

cutlass::gemm::GemmGlobalIteratorAb::BaseParams
Base::Params BaseParams
Iterator parameters type.
Definition: gemm_global_tile.h:194

cutlass::gemm::GemmGlobalTileTraits::Iterations
Shape< 1, Tile::kH/Threads::kH, Tile::kW/Threads::kW, Tile::kC/kAccessSize > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_global_tile.h:97

cutlass::gemm::GemmGlobalIteratorCd::Index
Index_ Index
The index.
Definition: gemm_global_tile.h:373

tile_iterator.h
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.

cutlass::gemm::GemmGlobalIteratorCd::This_
GemmGlobalIteratorCd< TileTraits_, Index_ > This_
This class.
Definition: gemm_global_tile.h:354

cutlass::gemm::GemmGlobalIteratorCd::kLayout
static MatrixLayout::Kind const kLayout
The layout.
Definition: gemm_global_tile.h:364

cutlass::gemm::GemmGlobalTileTraits
Definition: gemm_global_tile.h:70

cutlass::gemm::GemmGlobalTileTraits::Pointer
Scalar_ * Pointer
The pointer.
Definition: gemm_global_tile.h:78

coord.h
A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::IteratorAdvance::Kind
Kind
Definition: tile_iterator.h:62

cutlass::PredicateVector::at
CUTLASS_HOST_DEVICE bool at(int idx) const
Accesses a bit within the predicate vector.
Definition: predicate_vector.h:356

cutlass::MemorySpace::kGlobal
Definition: load_store.h:43

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:241

cutlass::gemm::GemmGlobalTileTraits::MultiplicandTraits
GemmMultiplicandTraits< Tile, kOperand, kLayout > MultiplicandTraits
Definition: gemm_global_tile.h:99

cutlass::gemm::GemmGlobalTileTraits::kMemorySpace
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_global_tile.h:82

cutlass::gemm::GemmGlobalIteratorCd::Base
TileIteratorBase< TileTraits_, typename TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ > Base
The base class.
Definition: gemm_global_tile.h:361

cutlass::gemm::GemmGlobalTileTraits::ThreadsDelta
Shape< 1, 1, Tile::kC > ThreadsDelta
The relative offset between two elements in the H/W dimension in adjacent threads.
Definition: gemm_global_tile.h:89

cutlass::gemm::GemmGlobalTileCdTraits::Delta
Shape< 0, 0, Base::Delta::kW, Base::Delta::kC > Delta
Override the strides in each dimension between different loads/stores.
Definition: gemm_global_tile.h:134

cutlass::gemm::GemmGlobalIteratorCd::Params::predicate_inc_h
Index predicate_inc_h
Definition: gemm_global_tile.h:386

cutlass::ComputeOffsetFromShape::get
static CUTLASS_DEVICE int get(int d, int h, int w, int c)
Definition: shape.h:166

cutlass::gemm::GemmGlobalIteratorCd::data
CUTLASS_HOST_DEVICE Pointer const data() const
Definition: gemm_global_tile.h:469

cutlass::gemm::GemmGlobalIteratorAb::initialize_predicates
CUTLASS_DEVICE void initialize_predicates(const Coord< 3 > &bounds, const Coord< 3 > &block)
Definition: gemm_global_tile.h:233

cutlass::IteratorAdvance::kH
Definition: tile_iterator.h:62

cutlass::TileLoadIterator::kAdvance
static IteratorAdvance::Kind const kAdvance
Specifies in which dimension post-increment accesses advance.
Definition: tile_iterator.h:331

cutlass::gemm::GemmGlobalIteratorAb::Base
TileLoadIterator< TileTraits_, typename TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ > Base
The base class.
Definition: gemm_global_tile.h:175

cutlass::gemm::GemmGlobalIteratorAb::valid
CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const
Is the iterator valid?
Definition: gemm_global_tile.h:336

cutlass::gemm::GemmGlobalIteratorAb::Params
Definition: gemm_global_tile.h:196

cutlass::GemmOperand::kC
Definition: matrix_traits.h:43

platform.h
C++ features that may be otherwise unimplemented for CUDA device functions.

cutlass::gemm::GemmGlobalIteratorAb
Definition: gemm_global_tile.h:159

cutlass::gemm::GemmGlobalIteratorCd::inc_advance
CUTLASS_DEVICE void inc_advance()
Increment the pointer to move to the next iteration.
Definition: gemm_global_tile.h:454

cutlass::gemm::GemmGlobalTileCdTraits::Base
GemmGlobalTileTraits< GemmOperand::kC, MatrixLayout::kColumnMajor, Scalar_, Tile_, Threads_, kAccessSize_ > Base
The base class.
Definition: gemm_global_tile.h:129

cutlass::MemorySpace::Kind
Kind
Definition: load_store.h:40

cutlass::TileIteratorBase::Params::stride_h
Index stride_h
Definition: tile_iterator.h:172

cutlass::gemm::GemmGlobalIteratorAb::kAdvance
static IteratorAdvance::Kind const kAdvance
Specifies in which dimension post-increment accesses advance.
Definition: gemm_global_tile.h:189

cutlass::gemm::GemmGlobalIteratorAb::Threads
TileTraits_::Threads Threads
The threads.
Definition: gemm_global_tile.h:183

cutlass::gemm::GemmGlobalIteratorAb::Params::initialize
CUTLASS_HOST_DEVICE int initialize(Scalar const *ptr, Index stride_h)
Initializes params to load a strip-mined tile, given pointer and stride_h.
Definition: gemm_global_tile.h:198

cutlass::TileLoadIterator::Params::initialize
CUTLASS_HOST_DEVICE int initialize()
Definition: tile_iterator.h:425

cutlass::gemm::GemmGlobalTileCdTraits::kStrideH
static int const kStrideH
The stride in the H dimension.
Definition: gemm_global_tile.h:132

cutlass::Shape::kH
static int const kH
The height of the cube.
Definition: shape.h:68

cutlass::gemm::ReshapeThreads< Tile_, Threads_, true >::Threads
Shape< Threads_::kD, Threads_::kH *Threads_::kW/Tile_::kW, Tile_::kW, 1 > Threads
Definition: gemm_global_tile.h:59

cutlass::gemm::GemmGlobalIteratorCd::Params::predicate_inc_advance
Index predicate_inc_advance
The strides to increment the predicate offset.
Definition: gemm_global_tile.h:386

cutlass::gemm::GemmGlobalTileTraits::kOperand
static GemmOperand::Kind const kOperand
Identity of the operand.
Definition: gemm_global_tile.h:72

cutlass::TileIteratorBase::Params::inc_h
Index inc_h
Definition: tile_iterator.h:176

predicate_vector.h
Defines container classes and iterators for managing a statically sized vector of boolean predicates...

cutlass::TileLoadIterator
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:302

cutlass::gemm::ReshapeThreads< Tile_, Threads_, true >
Definition: gemm_global_tile.h:58

cutlass::gemm::GemmGlobalIteratorAb::predicates
PredicateVector predicates
The predicates.
Definition: gemm_global_tile.h:342

cutlass::gemm::GemmGlobalTileTraits::Scalar
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_global_tile.h:76

cutlass::gemm::GemmGlobalIteratorAb::data
CUTLASS_HOST_DEVICE Scalar const  * data() const
Returns the current pointer.
Definition: gemm_global_tile.h:304

reshape_tile.h
Defines a type for restructuring a tile.

gemm_operand.h
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...

cutlass::gemm::GemmGlobalIteratorAb::Fragment
Base::Fragment Fragment
Fragment type loaded by the iterator.
Definition: gemm_global_tile.h:179

cutlass::gemm::GemmGlobalIteratorCd::Threads
TileTraits_::Threads Threads
The threads.
Definition: gemm_global_tile.h:371

cutlass::gemm::GemmGlobalTileCdTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_global_tile.h:147

cutlass::gemm::GemmGlobalIteratorCd::inc_h
CUTLASS_DEVICE void inc_h()
Increment the pointer in the H dimension.
Definition: gemm_global_tile.h:447

cutlass::gemm::GemmGlobalIteratorCd::GemmGlobalIteratorCd
CUTLASS_DEVICE GemmGlobalIteratorCd(Params const &params, const Coord< 3 > &bounds, const Coord< 3 > &block, int offset=0, int pred_offset=0, ThreadOffset thread_offset_func=ThreadOffset())
Ctor.
Definition: gemm_global_tile.h:420

cutlass::gemm::GemmMultiplicandTraits
Definition: gemm_operand.h:67

cutlass::gemm::GemmGlobalTileTraits::ThreadOffset
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_global_tile.h:102

cutlass::TileIteratorBase::Params::inc_advance
Index inc_advance
Definition: tile_iterator.h:179

cutlass::gemm::GemmGlobalIteratorAb::residue
CUTLASS_DEVICE void residue(Index k)
That's the residue! Update the predicates.
Definition: gemm_global_tile.h:307

cutlass::PredicateVector::fill
CUTLASS_HOST_DEVICE void fill(bool value=true)
Fills all predicates with a given value.
Definition: predicate_vector.h:343

cutlass::gemm::GemmGlobalIteratorAb::GemmGlobalIteratorAb
CUTLASS_DEVICE GemmGlobalIteratorAb(Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block, ThreadOffset thread_offset_func=ThreadOffset())
Ctor.
Definition: gemm_global_tile.h:267

cutlass::gemm::GemmGlobalIteratorCd::Params::initialize
CUTLASS_HOST_DEVICE int initialize(Pointer pointer, Index ld, Index bound, Index epilogue_stride_w, Index epilogue_delta_w)
Setup the params.
Definition: gemm_global_tile.h:391

cutlass::gemm::GemmGlobalIteratorCd::inc_c
CUTLASS_DEVICE void inc_c()
Increment the pointer in the C dimension.
Definition: gemm_global_tile.h:443

cutlass::gemm::GemmGlobalIteratorCd::data
CUTLASS_HOST_DEVICE Pointer data()
Returns the raw pointer.
Definition: gemm_global_tile.h:466

cutlass::TileLoadIterator::Params::pointer
Scalar const  * pointer
Pointer to memory.
Definition: tile_iterator.h:390

cutlass::gemm::GemmGlobalTileCdTraits::Threads
Base::Threads Threads
Definition: gemm_global_tile.h:138

cutlass::gemm::GemmGlobalIteratorCd::Params::stride_h
Index stride_h
The stride in the H dimension to setup the thread in the block.
Definition: gemm_global_tile.h:382

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46

cutlass::gemm::GemmGlobalTileTraits::ThreadOffset::operator()
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_global_tile.h:104

cutlass::gemm::GemmGlobalTileTraits::ImmediateOffsetStrides
Shape< 0, 0, Threads::kW *ThreadsDelta::kW, kAccessSize > ImmediateOffsetStrides
Strides for immediate offset computation.
Definition: gemm_global_tile.h:94

cutlass::PredicateVector
Statically sized array of bits implementing.
Definition: predicate_vector.h:104

cutlass::gemm::GemmGlobalIteratorAb::inc_h
CUTLASS_DEVICE void inc_h()
Increment the pointer in the H dimension.
Definition: gemm_global_tile.h:296

cutlass::gemm::GemmGlobalIteratorCd::ThreadOffset
TileTraits_::ThreadOffset ThreadOffset
The thread offset.
Definition: gemm_global_tile.h:375

cutlass::Shape
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64

cutlass::gemm::GemmGlobalTileCdTraits::ImmediateOffsetStrides
Base::ImmediateOffsetStrides ImmediateOffsetStrides
Definition: gemm_global_tile.h:142

cutlass::gemm::GemmGlobalIteratorCd::Scalar
TileTraits_::Scalar Scalar
The scalar.
Definition: gemm_global_tile.h:367

cutlass::gemm::GemmGlobalIteratorCd::Params::inc_h
Index inc_h
Definition: gemm_global_tile.h:384

cutlass::gemm::GemmGlobalIteratorCd::predicates
cutlass::PredicateVector< Base::Iterations::kW > predicates
The predicates for the row.
Definition: gemm_global_tile.h:472

cutlass::gemm::GemmGlobalIteratorAb::inc_d
CUTLASS_DEVICE void inc_d()
Increment the pointer in the D dimension.
Definition: gemm_global_tile.h:298

cutlass::gemm::GemmGlobalIteratorCd::Params::pointer
Pointer pointer
The pointer.
Definition: gemm_global_tile.h:380

cutlass::gemm::GemmGlobalIteratorAb::This_
GemmGlobalIteratorAb< TileTraits_, Index_ > This_
This class.
Definition: gemm_global_tile.h:167

cutlass::Coord< 4 >

cutlass::gemm::GemmGlobalTileTraits::Tile
ReshapeTile< Tile_, kAccessSize_ >::Tile Tile
The tile shape.
Definition: gemm_global_tile.h:85

cutlass::TileLoadIterator::Fragment
Base::Fragment Fragment
Fragment definition.
Definition: tile_iterator.h:364

cutlass::TileIteratorBase
Iterator for accessing a stripmined tile in memory.
Definition: tile_iterator.h:102

cutlass::gemm::GemmGlobalIteratorCd::inc_w
CUTLASS_DEVICE void inc_w()
Increment the pointer in the W dimension.
Definition: gemm_global_tile.h:445

cutlass::gemm::GemmGlobalIteratorCd::params
Params params
Definition: gemm_global_tile.h:412

cutlass::gemm::GemmGlobalIteratorCd
Definition: gemm_global_tile.h:348

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:36

cutlass::gemm::GemmGlobalIteratorCd::thread_offset
Coord< 4 > thread_offset
Offset of an individual lane from the start of the tile.
Definition: gemm_global_tile.h:414

cutlass::gemm::GemmGlobalIteratorAb::ThreadOffset
TileTraits_::ThreadOffset ThreadOffset
The thread offset.
Definition: gemm_global_tile.h:187

cutlass::Shape::kW
static int const kW
The width of the cube.
Definition: shape.h:70

cutlass::PredicateVector::set
CUTLASS_HOST_DEVICE void set(int idx, bool value=true)
Set a bit within the predicate vector.
Definition: predicate_vector.h:364

cutlass::TileLoadIterator::Params
Parameters.
Definition: tile_iterator.h:388

cutlass::gemm::GemmGlobalTileCdTraits::ThreadOffset
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_global_tile.h:145

cutlass::MatrixLayout::Kind
Kind
Definition: matrix_traits.h:36

cutlass::gemm::GemmGlobalTileTraits::kAccessSize
static int const kAccessSize
The number of scalars per LDG/STG.
Definition: gemm_global_tile.h:80

cutlass::ReshapeTile::Tile
Tile_ Tile
Definition: reshape_tile.h:43

cutlass::IteratorAdvance::kW
Definition: tile_iterator.h:62

cutlass::gemm::GemmGlobalTileCdTraits::Iterations
Base::Iterations Iterations
Definition: gemm_global_tile.h:136

cutlass::gemm::GemmGlobalIteratorAb::Index
Index_ Index
The index.
Definition: gemm_global_tile.h:185

cutlass::gemm::GemmGlobalIteratorCd::Pointer
TileTraits_::Pointer Pointer
The pointer.
Definition: gemm_global_tile.h:369

cutlass::GemmOperand::Kind
Kind
Definition: matrix_traits.h:43

cutlass::gemm::GemmGlobalIteratorAb::Scalar
TileTraits_::Scalar Scalar
The scalar.
Definition: gemm_global_tile.h:181

cutlass::gemm::ReshapeThreads::Threads
Threads_ Threads
Definition: gemm_global_tile.h:54

cutlass::gemm::GemmGlobalTileTraits::Threads
ReshapeThreads< Tile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:87

cutlass::gemm::GemmGlobalIteratorAb::inc_advance
CUTLASS_DEVICE void inc_advance()
Increment the pointer to move to the next iteration.
Definition: gemm_global_tile.h:300

cutlass::gemm::GemmGlobalIteratorCd::GemmGlobalIteratorCd
CUTLASS_DEVICE GemmGlobalIteratorCd()
Ctor.
Definition: gemm_global_tile.h:417

cutlass::gemm::GemmGlobalIteratorAb::params
Params params
The parameters.
Definition: gemm_global_tile.h:231

matrix_traits.h
Defines properties of matrices used to denote layout and operands to GEMM kernels.

cutlass::gemm::GemmGlobalIteratorCd::Params
The params.
Definition: gemm_global_tile.h:378

cutlass::gemm::GemmGlobalTileCdTraits::ThreadsDelta
Base::ThreadsDelta ThreadsDelta
Definition: gemm_global_tile.h:140

cutlass::gemm::GemmGlobalIteratorCd::valid
CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const
Test the validity of the iterator.
Definition: gemm_global_tile.h:460

cutlass::gemm::GemmGlobalIteratorAb::thread_offset
Coord< 4 > thread_offset
Offset of an individual lane from the start of the tile.
Definition: gemm_global_tile.h:229

cutlass::ShapeCount
Computes derived counts of a Layout Concept based class.
Definition: shape.h:79

cutlass::gemm::GemmGlobalIteratorCd::Params::predicate_offset
Index predicate_offset
The column offset to compute the predicate for the columns.
Definition: gemm_global_tile.h:388

cutlass::TileIteratorBase::Params::inc_d
Index inc_d
Definition: tile_iterator.h:175

cutlass::gemm::GemmGlobalTileTraits::kLayout
static MatrixLayout::Kind const kLayout
The layout.
Definition: gemm_global_tile.h:74