d7137f9c0a1633b76455109373887e1640713b5d/docs/gemm__global__stream_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/coord.h"
 #include "cutlass/convert.h"
 #include "cutlass/gemm/gemm_global_tile.h"
 #include "cutlass/tile_allocation.h"

 namespace cutlass {
 namespace gemm {


 /// Stream that moves one GEMM operand tile through three stages: a fragment is fetched with
 /// the load iterator (copy), converted by the transformer, and written out with the store
 /// iterator (commit).
 ///
 /// Template parameters:
 ///   Operand        - the kind of GEMM operand handled by this stream.
 ///   LoadIterator_  - iterator used by copy() to fill the fetched fragment.
 ///   StoreIterator_ - iterator used by commit() to write the transformed fragment.
 ///   Transformer_   - functor converting a FetchedFragment into a TransformedFragment.
 template <
     GemmOperand::Kind Operand,
     typename LoadIterator_,
     typename StoreIterator_,
     typename Transformer_>
 struct GlobalLoadStream {
   /// Indicates the type of GEMM operand.
   static GemmOperand::Kind const kOperand = Operand;
   /// The load iterator.
   typedef LoadIterator_ LoadIterator;
   /// The transformer.
   typedef Transformer_ Transformer;
   /// The store iterator.
   typedef StoreIterator_ StoreIterator;

   /// The fragment produced by the load iterator.
   typedef typename LoadIterator::Fragment FetchedFragment;
   /// The fragment obtained after the transformation by the transformer.
   typedef typename Transformer::OutputFragment TransformedFragment;
   // The transformer must consume exactly what the load iterator produces.
   static_assert((platform::is_same<FetchedFragment, typename Transformer::InputFragment>::value),
                 "GlobalLoadStream: LoadIterator::Fragment must match Transformer::InputFragment.");
   /// The fragment type exposed by the stream (alias of TransformedFragment).
   typedef TransformedFragment Fragment;
   // The store iterator must consume exactly what the transformer produces.
   static_assert((platform::is_same<TransformedFragment, typename StoreIterator::Fragment>::value),
                 "GlobalLoadStream: Transformer::OutputFragment must match StoreIterator::Fragment.");

   /// The layout, as reported by the load iterator.
   static MatrixLayout::Kind const kLayout = LoadIterator::kLayout;
   /// The scalar type of the load iterator.
   typedef typename LoadIterator::Scalar Scalar;
   /// The pointer type of the load iterator.
   typedef typename LoadIterator::Pointer Pointer;
   /// The index type of the load iterator.
   typedef typename LoadIterator::Index Index;
   /// The tile shape of the load iterator.
   typedef typename LoadIterator::Tile Tile;

   /// Allocation type for the tile written by the store iterator.
   typedef TileAllocation<typename StoreIterator::Scalar, typename StoreIterator::Tile>
       ThreadblockTileStorage;

   /// Tensor reference to the threadblock tile.
   typedef typename ThreadblockTileStorage::TensorRef ThreadblockTileRef;

   /// Parameters of the stream: the parameters of both iterators plus the residue offset.
   struct Params {
     // The load iterator.
     typename LoadIterator::Params load_iterator;
     // The store iterator.
     typename StoreIterator::Params store_iterator;
     // Offset to residue.
     Index offset_to_residue;

     /// Sets up the params.
     ///
     /// Stores _offset_to_residue, then initializes the load iterator params with
     /// (pointer, batch_stride, ldm) and finally the store iterator params.
     ///
     /// Returns the non-zero error code of the load iterator initialization if it fails,
     /// otherwise the value returned by the store iterator initialization.
     CUTLASS_HOST_DEVICE int initialize(Pointer pointer,
                                        long long batch_stride,
                                        Index ldm,
                                        Index _offset_to_residue) {

       offset_to_residue = _offset_to_residue;
       int error_code = load_iterator.initialize(pointer, batch_stride, ldm);
       if (error_code) {
         return error_code;
       }
       return store_iterator.initialize();
     }
   };

   /// Shared storage of the stream; it declares no members.
   struct SharedStorage {};

   //
   // Static member functions
   //

   /// Projects a GEMM coordinate onto this operand via ProjectOperand, then adds d_offset to
   /// element 0 and divides element 2 by LoadIterator::Tile::kC.
   CUTLASS_DEVICE static Coord<3> project_coordinate(Coord<3> const& coord, Index d_offset = 0) {
     bool const kKstrided =
         GemmMultiplicandTraits<typename LoadIterator::Tile, kOperand, kLayout>::kKstrided;
     Coord<3> tile_coord = ProjectOperand<kOperand, kKstrided>::project(coord);
     return make_Coord(
         tile_coord[0] + d_offset, tile_coord[1], tile_coord[2] / LoadIterator::Tile::kC);
   }

   /// Ctor.
   ///
   /// _params              - stream parameters; copied into the `params` member.
   /// shared_storage       - not referenced by this constructor.
   /// threadblock_tile_ref - its data() pointer is handed to the store iterator.
   /// bounds               - projected with d_offset = 1 to obtain the multiplicand bounds.
   /// _threadblock_offset  - projected with d_offset = 0 to obtain the threadblock offset.
   CUTLASS_DEVICE GlobalLoadStream(
       Params const& _params,
       SharedStorage& shared_storage,
       ThreadblockTileRef const& threadblock_tile_ref,
       Coord<3> const bounds,
       Coord<3> const& _threadblock_offset)
       : params(_params),
         multiplicand_bounds(project_coordinate(bounds, 1)),
         threadblock_offset(project_coordinate(_threadblock_offset)),
         // multiplicand_bounds and threadblock_offset are declared before load_iterator, so
         // they are already initialized here; reuse them instead of projecting a second time.
         load_iterator(params.load_iterator, multiplicand_bounds, threadblock_offset),
         transformer(),
         store_iterator(params.store_iterator, threadblock_tile_ref.data())
   {
     load_iterator.initialize_predicates(multiplicand_bounds, threadblock_offset);
     fetched_fragment.clear();
   }


   /// Loads the next fragment through the load iterator into fetched_fragment.
   CUTLASS_DEVICE void copy() { load_iterator.load_post_increment(fetched_fragment); }

   /// Commits the data: transforms the fetched fragment, stores the result through the store
   /// iterator and then calls inc_stage() on the store iterator.
   CUTLASS_DEVICE void commit() {
     transformer.transform(fetched_fragment, transformed_fragment);
     store_iterator.store_post_increment(transformed_fragment);
     store_iterator.inc_stage();
   }

   /// Executes the residue code: forwards k to the load iterator and, unless skip_clear is
   /// set, clears the fetched fragment.
   CUTLASS_DEVICE void residue(Index k, bool skip_clear = false) {
     load_iterator.residue(k);
     if (!skip_clear) {
       fetched_fragment.clear();
     }
   }

   /// Moves to the residue portion.
   ///
   /// When k is not a multiple of kTileK, residue() is invoked with the remainder (which also
   /// clears the fetched fragment). The load pointer is then advanced by
   /// params.offset_to_residue * load_iterator.stride_advance() in every case.
   CUTLASS_DEVICE void move_to_residue(Index k, Index kTileK) {
     Index kResidue = k % kTileK;
     if (kResidue) {
       residue(kResidue);
     }
     load_iterator.add_pointer_offset(params.offset_to_residue * load_iterator.stride_advance());
   }

   /// Rolls back: re-initializes the predicates from the stored bounds and offset, then moves
   /// the load pointer backwards by (offset_to_residue + kBlock) * stride_advance().
   CUTLASS_DEVICE void rollback(void) {
     load_iterator.initialize_predicates(multiplicand_bounds, threadblock_offset);

     // Tile extent chosen from the operand kind and the layout: Tile::kH for column-major A
     // and row-major B, Tile::kW otherwise.
     int const kBlock = kOperand == GemmOperand::kA
                            ? (kLayout == MatrixLayout::kColumnMajor ? Tile::kH : Tile::kW)
                            : (kLayout == MatrixLayout::kRowMajor ? Tile::kH : Tile::kW);

     load_iterator.add_pointer_offset(-(params.offset_to_residue + kBlock) *
                                      load_iterator.stride_advance());
   }

   /// Adds a Coord<3> to the underlying load iterator.
   CUTLASS_DEVICE GlobalLoadStream &operator+=(Coord<3> const &offset) {
     load_iterator += offset;
     return *this;
   }

   //
   // Data members
   //
   // NOTE: the constructor relies on multiplicand_bounds and threadblock_offset being declared
   // before load_iterator; keep this order.
   //

   /// Parameters.
   Params params;
   /// Multiplicand bounds.
   Coord<3> multiplicand_bounds;
   /// Threadblock offset.
   Coord<3> threadblock_offset;
   /// The load iterator.
   LoadIterator load_iterator;
   /// The fragment filled by the load iterator.
   FetchedFragment fetched_fragment;
   /// The transformer.
   Transformer transformer;
   /// The fragment holding the data after it has been converted by the transformer.
   TransformedFragment transformed_fragment;
   /// The store iterator.
   StoreIterator store_iterator;
 };

 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::GlobalLoadStream::ThreadblockTileRef
ThreadblockTileStorage::TensorRef ThreadblockTileRef
Tensor reference to threadblock tile.
Definition: gemm_global_stream.h:91

cutlass::gemm::GlobalLoadStream::Pointer
LoadIterator::Pointer Pointer
The pointer.
Definition: gemm_global_stream.h:80

cutlass::gemm::GlobalLoadStream::load_iterator
LoadIterator load_iterator
The iterator.
Definition: gemm_global_stream.h:212

cutlass
Definition: convert.h:33

cutlass::gemm::GlobalLoadStream::store_iterator
StoreIterator store_iterator
The store iterator.
Definition: gemm_global_stream.h:220

cutlass::gemm::GlobalLoadStream::params
Params params
Parameters.
Definition: gemm_global_stream.h:206

gemm_global_tile.h
Defines iterators for efficiently loading and storing to global memory.

cutlass::platform::is_same
std::is_same (false specialization)
Definition: platform.h:420

cutlass::TileAllocation::TensorRef
TensorRef< Scalar, 4 > TensorRef
Defines the tensor reference for this allocation.
Definition: tile_allocation.h:62

cutlass::gemm::GlobalLoadStream::kOperand
static GemmOperand::Kind const kOperand
Indicates the type of GEMM operand.
Definition: gemm_global_stream.h:54

cutlass::gemm::GlobalLoadStream::operator+=
CUTLASS_DEVICE GlobalLoadStream & operator+=(Coord< 3 > const &offset)
Adds a Coord<3> to the underlying global load iterator.
Definition: gemm_global_stream.h:196

cutlass::gemm::GlobalLoadStream::copy
CUTLASS_DEVICE void copy()
Load the data from global memory into the fetched fragment.
Definition: gemm_global_stream.h:157

coord.h
A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::gemm::GlobalLoadStream::multiplicand_bounds
Coord< 3 > multiplicand_bounds
Multiplicand bounds.
Definition: gemm_global_stream.h:208

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:318

cutlass::gemm::GlobalLoadStream::kLayout
static MatrixLayout::Kind const kLayout
The layout of the operand, taken from the load iterator.
Definition: gemm_global_stream.h:76

cutlass::gemm::GlobalLoadStream::Params::store_iterator
StoreIterator::Params store_iterator
Definition: gemm_global_stream.h:98

cutlass::gemm::GlobalLoadStream::fetched_fragment
FetchedFragment fetched_fragment
The fragment to fetch from global memory.
Definition: gemm_global_stream.h:214

cutlass::gemm::GlobalLoadStream::Params::initialize
CUTLASS_HOST_DEVICE int initialize(Pointer pointer, long long batch_stride, Index ldm, Index _offset_to_residue)
Setup the params.
Definition: gemm_global_stream.h:103

cutlass::gemm::GlobalLoadStream
Definition: gemm_global_stream.h:52

cutlass::gemm::GlobalLoadStream::SharedStorage
Definition: gemm_global_stream.h:120

cutlass::gemm::GlobalLoadStream::Scalar
LoadIterator::Scalar Scalar
The scalar type of the iterator.
Definition: gemm_global_stream.h:78

cutlass::gemm::GlobalLoadStream::residue
CUTLASS_DEVICE void residue(Index k, bool skip_clear=false)
Execute the residue code.
Definition: gemm_global_stream.h:167

cutlass::gemm::GlobalLoadStream::transformed_fragment
TransformedFragment transformed_fragment
The fragment to convert the data after it has been fetched from global memory.
Definition: gemm_global_stream.h:218

cutlass::MatrixLayout::kRowMajor
Definition: matrix_traits.h:159

tile_allocation.h
Defines a fragment based on a Shape<> template.

cutlass::gemm::GlobalLoadStream::Params::offset_to_residue
Index offset_to_residue
Definition: gemm_global_stream.h:100

cutlass::gemm::GlobalLoadStream::Fragment
TransformedFragment Fragment
The fragment type exposed by the stream (same as the transformed fragment).
Definition: gemm_global_stream.h:68

cutlass::gemm::GlobalLoadStream::LoadIterator
LoadIterator_ LoadIterator
The load iterator.
Definition: gemm_global_stream.h:56

cutlass::gemm::GemmMultiplicandTraits
Definition: gemm_operand.h:67

cutlass::gemm::GlobalLoadStream::commit
CUTLASS_DEVICE void commit()
Commit the data.
Definition: gemm_global_stream.h:160

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46

cutlass::TileAllocation
Class for storing a tile in memory and accessing it through a tensor ref.
Definition: tile_allocation.h:41

cutlass::gemm::GlobalLoadStream::transformer
Transformer transformer
The transformer.
Definition: gemm_global_stream.h:216

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::ProjectOperand
Definition: gemm_operand.h:96

cutlass::Coord< 3 >

cutlass::gemm::GlobalLoadStream::StoreIterator
StoreIterator_ StoreIterator
The store iterator to write to shared memory.
Definition: gemm_global_stream.h:60

cutlass::MatrixLayout::kColumnMajor
Definition: matrix_traits.h:159

cutlass::gemm::GlobalLoadStream::ThreadblockTileStorage
TileAllocation< typename StoreIterator::Scalar, typename StoreIterator::Tile > ThreadblockTileStorage
Shared memory allocation for the tile.
Definition: gemm_global_stream.h:88

cutlass::gemm::GlobalLoadStream::Params::load_iterator
LoadIterator::Params load_iterator
Definition: gemm_global_stream.h:96

cutlass::gemm::GlobalLoadStream::Params
The params.
Definition: gemm_global_stream.h:94

cutlass::gemm::GlobalLoadStream::Transformer
Transformer_ Transformer
The transformer.
Definition: gemm_global_stream.h:58

cutlass::MatrixLayout::Kind
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159

cutlass::gemm::GlobalLoadStream::threadblock_offset
Coord< 3 > threadblock_offset
Threadblock offset.
Definition: gemm_global_stream.h:210

cutlass::gemm::GlobalLoadStream::Index
LoadIterator::Index Index
The index.
Definition: gemm_global_stream.h:82

cutlass::gemm::GlobalLoadStream::project_coordinate
static CUTLASS_DEVICE Coord< 3 > project_coordinate(Coord< 3 > const &coord, Index d_offset=0)
Maps a coordinate in the GEMM's (K, N, M) coordinate system to global memory.
Definition: gemm_global_stream.h:127

cutlass::GemmOperand::Kind
Kind
Definition: matrix_traits.h:357

cutlass::GemmOperand::kA
Definition: matrix_traits.h:357

cutlass::gemm::GlobalLoadStream::move_to_residue
CUTLASS_DEVICE void move_to_residue(Index k, Index kTileK)
Move to the residue portion.
Definition: gemm_global_stream.h:175

cutlass::gemm::GlobalLoadStream::FetchedFragment
LoadIterator::Fragment FetchedFragment
The fragment that is copied from global memory.
Definition: gemm_global_stream.h:63

cutlass::gemm::GlobalLoadStream::TransformedFragment
Transformer::OutputFragment TransformedFragment
The fragment that is obtained after the transformation by the transformer.
Definition: gemm_global_stream.h:65

cutlass::gemm::GlobalLoadStream::GlobalLoadStream
CUTLASS_DEVICE GlobalLoadStream(Params const &_params, SharedStorage &shared_storage, ThreadblockTileRef const &threadblock_tile_ref, Coord< 3 > const bounds, Coord< 3 > const &_threadblock_offset)
Ctor.
Definition: gemm_global_stream.h:136

convert.h
Defines conversion operations among Fragments of different base type.

cutlass::gemm::GlobalLoadStream::Tile
LoadIterator::Tile Tile
The tile.
Definition: gemm_global_stream.h:84

cutlass::gemm::GlobalLoadStream::rollback
CUTLASS_DEVICE void rollback(void)
Rollback to the beginning of the first tile.
Definition: gemm_global_stream.h:184