74df0331f2a839e20abb2786c82b90487e8bef6a/docs/gemm__stream__pair_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/convert.h"
 #include "cutlass/matrix_traits.h"
 #include "cutlass/reshape_tile.h"
 #include "cutlass/tile_allocation.h"
 #include "cutlass/tile_iterator.h"

 #include "cutlass/gemm/clear_accumulators.h"
 #include "cutlass/gemm/gemm_config.h"
 #include "cutlass/gemm/gemm_global_stream.h"
 #include "cutlass/gemm/gemm_operand.h"
 #include "cutlass/gemm/gemm_shared_stream.h"
 #include "cutlass/gemm/threadblock_swizzle.h"

 namespace cutlass {
 namespace gemm {


 /// Collect the global load streams for multiplicands.
 ///
 /// Owns one stream per multiplicand and forwards every operation to both of them, always A
 /// before B. The kResidueInProlog_ flag picks the path taken by move_to_residue() and gates
 /// rollback().
 template <typename StreamA_, typename StreamB_, bool kResidueInProlog_>
 struct GlobalLoadStreamPair {
   //
   // Type definitions
   //

   /// Stream for A multiplicand
   typedef StreamA_ StreamA;

   /// Stream for B multiplicand
   typedef StreamB_ StreamB;

   /// Assumes the A stream defines the index type
   typedef typename StreamA::Index Index;

   /// Parameters object: one Params per stream
   struct Params {
     /// Parameters object for StreamA
     typename StreamA::Params stream_a;

     /// Parameters object for StreamB
     typename StreamB::Params stream_b;

     /// Default constructor
     CUTLASS_HOST_DEVICE
     Params() {}

     /// Constructs a global load stream pair Params object from the two per-stream Params
     CUTLASS_HOST_DEVICE
     Params(typename StreamA::Params const &_params_A, typename StreamB::Params const &_params_B)
         : stream_a(_params_A), stream_b(_params_B) {}
   };

   /// Shared memory allocation for threadblock-scoped GEMM tile
   typedef ZipTileAllocation<typename StreamA::ThreadblockTileStorage,
                             typename StreamB::ThreadblockTileStorage>
       ThreadblockTileStorage;

   /// ZipTensorRef to threadblock tiles
   typedef typename ThreadblockTileStorage::TensorRef ThreadblockTileRef;

   /// Defines a structure containing shared storage for each pair
   struct SharedStorage {
     typename StreamA::SharedStorage stream_a;
     typename StreamB::SharedStorage stream_b;
   };

   //
   // Data members
   //

   /// Stream for A multiplicand
   StreamA stream_a;

   /// Stream for B multiplicand
   StreamB stream_b;

   //
   // Methods
   //

   /// Ctor. Each stream receives its own member of params and shared_storage, plus its half of
   /// threadblock_tile_ref (.first for A, .second for B); bounds and block_offset are handed
   /// unchanged to both streams.
   CUTLASS_DEVICE GlobalLoadStreamPair(Params const &params,
                                       SharedStorage &shared_storage,
                                       ThreadblockTileRef const &threadblock_tile_ref,
                                       Coord<3> const &bounds,
                                       Coord<3> const &block_offset = make_Coord(0, 0, 0))
       : stream_a(params.stream_a,
                  shared_storage.stream_a,
                  threadblock_tile_ref.first,
                  bounds,
                  block_offset),
         stream_b(params.stream_b,
                  shared_storage.stream_b,
                  threadblock_tile_ref.second,
                  bounds,
                  block_offset) {}

   /// Applies the same offset to both streams and returns this pair
   CUTLASS_DEVICE
   GlobalLoadStreamPair & operator+=(Coord<3> const offset) {
     stream_a += offset;
     stream_b += offset;

     return *this;
   }

   /// Invokes copy() on the A stream, then on the B stream
   CUTLASS_DEVICE void copy() {
     stream_a.copy();
     stream_b.copy();
   }

   /// Commit the data: invokes commit() on the A stream, then on the B stream
   CUTLASS_DEVICE void commit() {
     stream_a.commit();
     stream_b.commit();
   }

   /// Execute the residue code on both streams, forwarding k and skip_clear unchanged
   CUTLASS_DEVICE void residue(Index k, bool skip_clear = false) {
     stream_a.residue(k, skip_clear);
     stream_b.residue(k, skip_clear);
   }

   /// Move to residue.
   ///
   /// With kResidueInProlog_ set, both streams are asked to move_to_residue(k, kTileK).
   /// Otherwise nothing is moved; residue(k, true) is executed instead, and only when
   /// k < kTileK.
   CUTLASS_DEVICE void move_to_residue(Index k, Index kTileK) {
     if (!kResidueInProlog_) {
       // Residue is not handled in the prolog: fall back to the residue path when k is
       // smaller than kTileK, skipping the clear.
       if (k < kTileK) {
         residue(k, true);
       }
       return;
     }

     stream_a.move_to_residue(k, kTileK);
     stream_b.move_to_residue(k, kTileK);
   }

   /// Rollback to beginning of first tile.
   ///
   /// Does nothing unless both kResidueInProlog_ and the kRollback argument are true.
   CUTLASS_DEVICE void rollback(bool kRollback) {
     if (!kResidueInProlog_ || !kRollback) {
       return;
     }

     stream_a.rollback();
     stream_b.rollback();
   }
 };

 /// Pairs the A and B streams so they can be driven together.
 ///
 /// Every method forwards to both member streams, always A before B; the fragment accessors
 /// expose each stream's fragment individually.
 template <typename StreamA_, typename StreamB_>
 struct SharedStreamPair {
   //
   // Type definitions
   //

   /// Stream for A multiplicand
   typedef StreamA_ StreamA;

   /// Stream for B multiplicand
   typedef StreamB_ StreamB;

   /// Parameters object passed to load iterators: one Params per stream
   struct Params {
     /// Parameters for the A stream
     typename StreamA::Params stream_a;

     /// Parameters for the B stream
     typename StreamB::Params stream_b;
   };

   /// Zipped pair of the two streams' TensorRef types (.first for A, .second for B)
   typedef ZipTensorRef<typename StreamA::TensorRef, typename StreamB::TensorRef>
       ThreadblockTileRef;

   //
   // Data members
   //

   /// The stream for A
   StreamA stream_a;

   /// The stream for B
   StreamB stream_b;

   //
   // Methods
   //

   /// Construct with the composable structure: each stream receives its own Params member and
   /// its half of threadblock_tile_ref.
   CUTLASS_DEVICE SharedStreamPair(Params const &params,
                                   ThreadblockTileRef const &threadblock_tile_ref)
       : stream_a(params.stream_a, threadblock_tile_ref.first),
         stream_b(params.stream_b, threadblock_tile_ref.second) {}

   /// Trigger the copies from shared memory to registers; `step` is forwarded to both streams
   CUTLASS_DEVICE void copy(int step) {
     stream_a.copy(step);
     stream_b.copy(step);
   }

   /// Commit the data; `step` is forwarded to both streams
   CUTLASS_DEVICE void commit(int step) {
     stream_a.commit(step);
     stream_b.commit(step);
   }

   /// The fragment A: whatever the A stream's fragment(step) yields
   CUTLASS_DEVICE
   typename StreamA::TransformedFragment const &fragment_a(int step) const {
     return stream_a.fragment(step);
   }

   /// The fragment B: whatever the B stream's fragment(step) yields
   CUTLASS_DEVICE
   typename StreamB::TransformedFragment const &fragment_b(int step) const {
     return stream_b.fragment(step);
   }

   /// Increment the stage of both streams
   CUTLASS_DEVICE void inc_stage() {
     stream_a.inc_stage();
     stream_b.inc_stage();
   }
 };


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::GlobalLoadStreamPair::Params::Params
CUTLASS_HOST_DEVICE Params(typename StreamA::Params const &_params_A, typename StreamB::Params const &_params_B)
Constructs a global load stream pair Params object.
Definition: gemm_stream_pair.h:75

cutlass::gemm::GlobalLoadStreamPair::operator+=
CUTLASS_DEVICE GlobalLoadStreamPair & operator+=(Coord< 3 > const offset)
Definition: gemm_stream_pair.h:128

cutlass::gemm::SharedStreamPair::StreamA
StreamA_ StreamA
Stream for A multiplicand.
Definition: gemm_stream_pair.h:179

cutlass::gemm::GlobalLoadStreamPair::move_to_residue
CUTLASS_DEVICE void move_to_residue(Index k, Index kTileK)
Move to residue.
Definition: gemm_stream_pair.h:153

cutlass
Definition: convert.h:33

cutlass::gemm::GlobalLoadStreamPair::SharedStorage::stream_b
StreamB::SharedStorage stream_b
Definition: gemm_stream_pair.h:93

cutlass::ZipTensorRef
Definition: zip_tensor_ref.h:38

tile_iterator.h
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.

cutlass::gemm::SharedStreamPair::Params::stream_a
StreamA::Params stream_a
Definition: gemm_stream_pair.h:187

cutlass::gemm::GlobalLoadStreamPair::SharedStorage
Defines a structure containing shared storage for each pair.
Definition: gemm_stream_pair.h:91

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:318

cutlass::gemm::GlobalLoadStreamPair::ThreadblockTileStorage
ZipTileAllocation< typename StreamA::ThreadblockTileStorage, typename StreamB::ThreadblockTileStorage > ThreadblockTileStorage
Shared memory allocation for threadblock-scoped GEMM tile.
Definition: gemm_stream_pair.h:85

cutlass::gemm::GlobalLoadStreamPair::residue
CUTLASS_DEVICE void residue(Index k, bool skip_clear=false)
Execute the residue code.
Definition: gemm_stream_pair.h:147

cutlass::gemm::GlobalLoadStreamPair::GlobalLoadStreamPair
CUTLASS_DEVICE GlobalLoadStreamPair(Params const &params, SharedStorage &shared_storage, ThreadblockTileRef const &threadblock_tile_ref, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
Ctor.
Definition: gemm_stream_pair.h:111

cutlass::gemm::GlobalLoadStreamPair::ThreadblockTileRef
ThreadblockTileStorage::TensorRef ThreadblockTileRef
ZipTensorRef to threadblock tiles.
Definition: gemm_stream_pair.h:88

cutlass::gemm::SharedStreamPair::SharedStreamPair
CUTLASS_DEVICE SharedStreamPair(Params const &params, ThreadblockTileRef const &threadblock_tile_ref)
Construct with the composable structure.
Definition: gemm_stream_pair.h:213

cutlass::gemm::GlobalLoadStreamPair::StreamB
StreamB_ StreamB
Stream for B multiplicand.
Definition: gemm_stream_pair.h:59

cutlass::gemm::SharedStreamPair
Collect the shared load streams for multiplicands.
Definition: gemm_stream_pair.h:173

cutlass::gemm::SharedStreamPair::fragment_b
CUTLASS_DEVICE StreamB::TransformedFragment const  & fragment_b(int step) const
The fragment B.
Definition: gemm_stream_pair.h:237

cutlass::gemm::GlobalLoadStreamPair::Params::stream_b
StreamB::Params stream_b
Parameters object for StreamB.
Definition: gemm_stream_pair.h:67

cutlass::gemm::GlobalLoadStreamPair::rollback
CUTLASS_DEVICE void rollback(bool kRollback)
Rollback to beginning of first tile.
Definition: gemm_stream_pair.h:163

cutlass::gemm::SharedStreamPair::stream_a
StreamA stream_a
The stream for A.
Definition: gemm_stream_pair.h:203

cutlass::gemm::SharedStreamPair::fragment_a
CUTLASS_DEVICE StreamA::TransformedFragment const  & fragment_a(int step) const
The fragment A.
Definition: gemm_stream_pair.h:231

cutlass::gemm::GlobalLoadStreamPair
Collect the global load streams for multiplicands.
Definition: gemm_stream_pair.h:50

cutlass::gemm::SharedStreamPair::copy
CUTLASS_DEVICE void copy(int step)
Trigger the copies from shared memory to registers.
Definition: gemm_stream_pair.h:218

tile_allocation.h
Defines a fragment based on a Shape<> template.

cutlass::gemm::GlobalLoadStreamPair::Params
Parameters object.
Definition: gemm_stream_pair.h:62

reshape_tile.h
Defines a type for restructuring a tile.

gemm_operand.h
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...

clear_accumulators.h
Defines abstractions for efficiently clearing accumulator tiles.

cutlass::gemm::GlobalLoadStreamPair::StreamA
StreamA_ StreamA
Stream for A multiplicand.
Definition: gemm_stream_pair.h:56

cutlass::gemm::GlobalLoadStreamPair::stream_a
StreamA stream_a
Stream for A multiplicand.
Definition: gemm_stream_pair.h:101

cutlass::gemm::SharedStreamPair::ThreadblockTileRef
ZipTensorRef< typename StreamA::TensorRef, typename StreamB::TensorRef > ThreadblockTileRef
ZipTensorRef to threadblock tiles.
Definition: gemm_stream_pair.h:196

cutlass::gemm::GlobalLoadStreamPair::Index
StreamA::Index Index
Assumes the A stream defines the index type.
Definition: gemm_stream_pair.h:80

cutlass::ZipTileAllocation
Manages a pair of tile allocations as if they are one allocation.
Definition: tile_allocation.h:100

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46

gemm_config.h
Defines properties of GEMM computation that impose some constraints on caller.

cutlass::gemm::GlobalLoadStreamPair::Params::Params
CUTLASS_HOST_DEVICE Params()
Default constructor.
Definition: gemm_stream_pair.h:71

cutlass::gemm::SharedStreamPair::StreamB
StreamB_ StreamB
Stream for B multiplicand.
Definition: gemm_stream_pair.h:182

cutlass::gemm::GlobalLoadStreamPair::SharedStorage::stream_a
StreamA::SharedStorage stream_a
Definition: gemm_stream_pair.h:92

cutlass::gemm::SharedStreamPair::commit
CUTLASS_DEVICE void commit(int step)
Commit the data.
Definition: gemm_stream_pair.h:224

cutlass::Coord< 3 >

cutlass::gemm::GlobalLoadStreamPair::commit
CUTLASS_DEVICE void commit()
Commit the data.
Definition: gemm_stream_pair.h:141

gemm_global_stream.h
Implements efficient loading of the thread block-level tile from global memory and storing to shared ...

cutlass::gemm::SharedStreamPair::stream_b
StreamB stream_b
The stream for B.
Definition: gemm_stream_pair.h:206

cutlass::gemm::SharedStreamPair::inc_stage
CUTLASS_DEVICE void inc_stage()
Increment the stage.
Definition: gemm_stream_pair.h:242

cutlass::gemm::SharedStreamPair::Params
Parameters object passed to load iterators.
Definition: gemm_stream_pair.h:185

cutlass::gemm::SharedStreamPair::Params::stream_b
StreamB::Params stream_b
Definition: gemm_stream_pair.h:190

threadblock_swizzle.h
Defines functors for mapping blockIdx to partitions of the GEMM computation.

matrix_traits.h
Defines properties of matrices used to denote layout and operands to GEMM kernels.

cutlass::gemm::GlobalLoadStreamPair::copy
CUTLASS_DEVICE void copy()
Trigger the copies from global memory to registers.
Definition: gemm_stream_pair.h:135

cutlass::gemm::GlobalLoadStreamPair::Params::stream_a
StreamA::Params stream_a
Parameters object for StreamA.
Definition: gemm_stream_pair.h:64

gemm_shared_stream.h
Defines abstractions for managing loading and storing fragments to shared memory in the efficient GEM...

cutlass::gemm::GlobalLoadStreamPair::stream_b
StreamB stream_b
Stream for B multiplicand.
Definition: gemm_stream_pair.h:104

convert.h
Defines conversion operations among Fragments of different base type.