Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
Namespaces | Classes | Functions
cutlass Namespace Reference

Namespaces

 gemm
 
 platform
 

Classes

struct  AlignedStruct
 
struct  ComputeOffsetFromShape
 Compute the offset for the given coordinates in a cube. More...
 
struct  ComputeOffsetFromShape< Shape< 1, kSh_, kSw_, 1 > >
 Compute the offset for the given coordinates in a cube with one channel and a depth of 1. More...
 
struct  ComputeOffsetFromShape< Shape< 1, kSh_, kSw_, kSc_ > >
 Compute the offset for the given coordinates in a cube with a depth of 1. More...
 
struct  ComputeOffsetFromStrides
 Compute the offset for the given coordinates in a cube. More...
 
struct  ComputeOffsetFromStrides< Shape< 1, S_h_, S_w_, 1 > >
 Compute the offset for the given coordinates in a cube with one channel and a depth of 1. More...
 
struct  ComputeOffsetFromStrides< Shape< 1, S_h_, S_w_, S_c_ > >
 Compute the offset for the given coordinates in a cube with a depth of 1. More...
 
struct  ComputeThreadOffsetFromStrides
 Decompose threadIdx.x into the coordinates of a cube whose dimensions are specified by Threads_. Afterwards compute the offset of those coordinates using Strides_. More...
 
struct  ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, 1 >, Shape< 1, S_h_, S_w_, 1 > >
 Specialization for D=1 and C=1. More...
 
struct  ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, T_c_ >, Shape< 1, S_h_, S_w_, S_c_ > >
 Specialization for D=1. More...
 
struct  ConstPredicateTileAdapter
 Adapter to enable random access to predicates via logical coordinate within a tile. More...
 
struct  Convert
 
struct  Convert< Fragment< InputScalar_, kScalars_ >, Fragment< OutputScalar_, kScalars_ > >
 
struct  Coord
 Statically-sized array specifying Coords within a tensor. More...
 
struct  Copy
 
struct  divide_assert
 
struct  Extent
 Returns the extent of a scalar or vector. More...
 
struct  Extent< Vector< T, Lanes > >
 Returns the number of lanes of a vector if need be. More...
 
struct  Extent< Vector< T, Lanes > const >
 Returns the number of lanes of a vector if need be. More...
 
struct  Fragment
 A template defining Fragment Concept. More...
 
struct  FragmentConstIterator
 
struct  FragmentIterator
 A template defining Fragment Iterator Concept. More...
 
struct  FragmentLoad
 
struct  FragmentLoad< IteratorFragment::kScalar, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 
struct  FragmentLoad< IteratorFragment::kWmmaMatrix, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 
struct  FragmentStore
 
struct  FragmentStore< IteratorFragment::kScalar, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 
struct  FragmentStore< IteratorFragment::kWmmaMatrix, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 
struct  GemmOperand
 Gemm operand - D = A * B + C. More...
 
struct  Identity
 Describes identity elements. More...
 
struct  is_pow2
 
struct  IteratorAdvance
 Specifies dimension in which post-increment accesses advance. More...
 
struct  IteratorFragment
 Specifies whether iterator storage fragment consists of Scalar values or WMMA matrix. More...
 
struct  Load
 
struct  Load< double, 2, Memory_, true, 16 >
 
struct  Load< Scalar_, Lanes_, Memory_, true, 16 >
 
struct  Load< Scalar_, Lanes_, Memory_, true, 4 >
 
struct  Load< Scalar_, Lanes_, Memory_, true, 8 >
 
struct  log2_down
 
struct  log2_down< N, 1, Count >
 
struct  log2_up
 
struct  log2_up< N, 1, Count >
 
struct  MatrixLayout
 Describes layouts of matrices. More...
 
struct  MemorySpace
 Enum to specify which memory space data resides in. More...
 
struct  PredicateTileAdapter
 Adapter to enable random access to predicates via logical coordinate within a tile. More...
 
struct  PredicateVector
 Statically sized array of bits implementing the Predicate Vector Concept. More...
 
struct  ReshapeTile
 
struct  ReshapeTile< Tile_, kAccessSize_, true >
 
struct  Shape
 A Shape implementing Layout Concept describing the dimensions of a cube. More...
 
struct  ShapeAdd
 
struct  ShapeCount
 Compute derived counts of a Layout Concept based class. More...
 
struct  ShapeDiv
 
struct  ShapeMax
 
struct  ShapeMin
 
struct  ShapeMul
 
struct  ShapeScale
 
struct  ShapeStrides
 
struct  ShapeSub
 
struct  sqrt_est
 
struct  StorageType
 
struct  StorageType< 1 >
 
struct  StorageType< 2 >
 
struct  StorageType< 4 >
 
struct  Store
 
struct  Store< double, 2, Memory_, true, 16 >
 
struct  Store< Scalar_, Lanes_, Memory_, true, 16 >
 
struct  Store< Scalar_, Lanes_, Memory_, true, 4 >
 
struct  Store< Scalar_, Lanes_, Memory_, true, 8 >
 
class  TensorRef
 Structure modeling a pointer and stride into a tensor. More...
 
class  TensorView
 Host-side reference implementation of tensor operations. More...
 
struct  TiledThreadOffset
 Basic thread offset function computed from a thread shape. More...
 
struct  TileIteratorBase
 Iterator for accessing a stripmined tile in memory. More...
 
struct  TileLoadIterator
 An iterator implementing Tile Load Iterator Concept for loading a tile from memory. More...
 
struct  TileStoreIterator
 An iterator implementing Tile Store Iterator Concept for storing a tile to memory. More...
 
struct  TileTraits
 A template defining Tile Traits Concept. More...
 
struct  TileTraitsContiguousMajor
 
struct  TileTraitsStandard
 Chooses 'best' shape to enable warp raking along contiguous dimension if possible. More...
 
struct  TileTraitsStrideMajor
 
struct  TileTraitsWarpRake
 Tiling in which warps rake across the contiguous dimension. More...
 
struct  TrivialPredicateTileAdapter
 Always returns true predicate. More...
 
union  Vector
 
union  Vector< half, kLanes_ >
 
struct  Vectorize
 
struct  Vectorize< Element_, 1 >
 
struct  VectorTraits
 Traits describing properties of vectors and scalar-as-vectors. More...
 
struct  VectorTraits< Vector< T, Lanes > >
 Partial specialization for actual cutlass::Vector. More...
 
struct  VectorTraits< Vector< T, Lanes > const >
 Partial specialization for actual cutlass::Vector. More...
 

Functions

CUTLASS_HOST_DEVICE Coord< 1 > make_Coord (int _0)
 Helper to make a 1-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > make_Coord (int _0, int _1)
 Helper to make a 2-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > make_Coord (int _0, int _1, int _2)
 Helper to make a 3-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 4 > make_Coord (int _0, int _1, int _2, int _3)
 Helper to make a 4-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > get_Coord_hw (Coord< 3 > const &coord)
 Getter. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > get_Coord_hw (Coord< 4 > const &coord)
 Getter. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > get_Coord_hwc (Coord< 4 > const &coord)
 Getter. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > get_Coord_dhw (Coord< 4 > const &coord)
 Getter. More...
 
template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_load (InputIterator &iterator, Fragment &fragment)
 Loads a fragment from an input iterator. More...
 
template<typename InputIterator , typename Fragment >
CUTLASS_DEVICE void shared_iterator_load (InputIterator &iterator, Fragment &fragment)
 Loads a fragment from a shared memory input iterator. More...
 
template<typename InputIterator , typename Fragment >
CUTLASS_DEVICE void shared_iterator_load (InputIterator &iterator, Fragment &fragment, int d)
 Loads a fragment from a shared memory input iterator. More...
 
template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_load_post_increment (InputIterator &iterator, Fragment &fragment, typename InputIterator::Index offset, ConstPredicateAdapter predicate_adapter)
 Loads a fragment from an input iterator, masked by a predicate iterator. More...
 
template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_load_post_increment (InputIterator &iterator, Fragment &fragment, typename InputIterator::Index offset=0)
 Loads a fragment from an input iterator. More...
 
template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_load_post_increment (InputIterator &iterator, Fragment &fragment, ConstPredicateAdapter pred_it)
 Loads a fragment from an input iterator. More...
 
template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_load (InputIterator const &_iterator, Fragment &fragment, typename InputIterator::Index offset, ConstPredicateAdapter predicate_adapter)
 
template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_load (InputIterator const &iterator, Fragment &fragment, typename InputIterator::Index offset=0)
 Loads a fragment from an input iterator. More...
 
template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_load (InputIterator const &iterator, Fragment &fragment, ConstPredicateAdapter pred_it)
 Loads a fragment from an input iterator. More...
 
template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_store (OutputIterator &iterator, Fragment &fragment)
 Stores a fragment to an output iterator. More...
 
template<typename OutputIterator , typename Fragment >
CUTLASS_DEVICE void shared_iterator_store (OutputIterator &iterator, Fragment const &fragment)
 Stores a fragment to a shared memory output iterator. More...
 
template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_store_post_increment (OutputIterator &iterator, Fragment const &fragment, typename OutputIterator::Index offset, ConstPredicateAdapter predicate_adapter)
 Stores a fragment to an output iterator, masked by a predicate iterator. More...
 
template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_store_post_increment (OutputIterator &iterator, Fragment const &fragment, typename OutputIterator::Index offset=0)
 Stores a fragment to an output iterator. More...
 
template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_store_post_increment (OutputIterator &iterator, Fragment const &fragment, ConstPredicateAdapter pred_it)
 Stores a fragment to an output iterator. More...
 
template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_store (OutputIterator const &_iterator, Fragment const &fragment, typename OutputIterator::Index offset, ConstPredicateAdapter predicate_adapter)
 Stores a fragment to an output iterator, masked by a predicate iterator. More...
 
template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_store (OutputIterator const &iterator, Fragment const &fragment, typename OutputIterator::Index offset=0)
 Stores a fragment to an output iterator. More...
 
template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void iterator_store (OutputIterator const &iterator, Fragment const &fragment, ConstPredicateAdapter pred_it)
 Stores a fragment to an output iterator. More...
 
template<typename dividend_t , typename divisor_t >
CUTLASS_HOST_DEVICE dividend_t round_nearest (dividend_t dividend, divisor_t divisor)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t gcd (value_t a, value_t b)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t lcm (value_t a, value_t b)
 
__host__ CUTLASS_DEVICE cudaError_t cuda_perror_impl (cudaError_t error, const char *filename, int line)
 The corresponding error message is printed to stderr (or stdout in device code) along with the supplied source context. More...
 
template<>
struct __align__ (1) AlignedStruct< 1 >
 
template<>
struct __align__ (2) AlignedStruct< 2 >
 
template<>
struct __align__ (4) AlignedStruct< 4 >
 
template<>
struct __align__ (8) AlignedStruct< 8 >
 
template<>
struct __align__ (16) AlignedStruct< 16 >
 
template<>
struct __align__ (32) AlignedStruct< 32 >
 
template<>
struct __align__ (64) AlignedStruct< 64 >
 
template<typename Scalar_ >
CUTLASS_DEVICE void make_zero (Scalar_ &x)
 
template<typename Scalar_ , int kLanes_>
CUTLASS_DEVICE void make_zero (Vector< Scalar_, kLanes_ > &vec)
 

Function Documentation

◆ __align__() [1/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [2/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [3/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [4/7]

template<>
struct cutlass::__align__ ( 16  )

◆ __align__() [5/7]

template<>
struct cutlass::__align__ ( 32  )

◆ __align__() [6/7]

template<>
struct cutlass::__align__ ( 64  )

◆ __align__() [7/7]

template<>
struct cutlass::__align__ ( )

◆ cuda_perror_impl()

__host__ CUTLASS_DEVICE cudaError_t cutlass::cuda_perror_impl ( cudaError_t  error,
const char *  filename,
int  line 
)
Returns
The CUDA error.

◆ gcd()

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::gcd ( value_t  a,
value_t  b 
)

Greatest common divisor

◆ get_Coord_dhw()

CUTLASS_HOST_DEVICE Coord<3> cutlass::get_Coord_dhw ( Coord< 4 > const &  coord)

◆ get_Coord_hw() [1/2]

CUTLASS_HOST_DEVICE Coord<2> cutlass::get_Coord_hw ( Coord< 3 > const &  coord)

◆ get_Coord_hw() [2/2]

CUTLASS_HOST_DEVICE Coord<2> cutlass::get_Coord_hw ( Coord< 4 > const &  coord)

◆ get_Coord_hwc()

CUTLASS_HOST_DEVICE Coord<3> cutlass::get_Coord_hwc ( Coord< 4 > const &  coord)

◆ iterator_load() [1/4]

template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_load ( InputIterator &  iterator,
Fragment &  fragment 
)

◆ iterator_load() [2/4]

template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_load ( InputIterator const &  _iterator,
Fragment &  fragment,
typename InputIterator::Index  offset,
ConstPredicateAdapter  predicate_adapter 
)

◆ iterator_load() [3/4]

template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_load ( InputIterator const &  iterator,
Fragment &  fragment,
typename InputIterator::Index  offset = 0 
)

◆ iterator_load() [4/4]

template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_load ( InputIterator const &  iterator,
Fragment &  fragment,
ConstPredicateAdapter  pred_it 
)

◆ iterator_load_post_increment() [1/3]

template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_load_post_increment ( InputIterator &  iterator,
Fragment &  fragment,
typename InputIterator::Index  offset,
ConstPredicateAdapter  predicate_adapter 
)

◆ iterator_load_post_increment() [2/3]

template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_load_post_increment ( InputIterator &  iterator,
Fragment &  fragment,
typename InputIterator::Index  offset = 0 
)

◆ iterator_load_post_increment() [3/3]

template<typename InputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_load_post_increment ( InputIterator &  iterator,
Fragment &  fragment,
ConstPredicateAdapter  pred_it 
)

◆ iterator_store() [1/4]

template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_store ( OutputIterator &  iterator,
Fragment &  fragment 
)

◆ iterator_store() [2/4]

template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_store ( OutputIterator const &  _iterator,
Fragment const &  fragment,
typename OutputIterator::Index  offset,
ConstPredicateAdapter  predicate_adapter 
)

◆ iterator_store() [3/4]

template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_store ( OutputIterator const &  iterator,
Fragment const &  fragment,
typename OutputIterator::Index  offset = 0 
)

◆ iterator_store() [4/4]

template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_store ( OutputIterator const &  iterator,
Fragment const &  fragment,
ConstPredicateAdapter  pred_it 
)

◆ iterator_store_post_increment() [1/3]

template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_store_post_increment ( OutputIterator &  iterator,
Fragment const &  fragment,
typename OutputIterator::Index  offset,
ConstPredicateAdapter  predicate_adapter 
)

◆ iterator_store_post_increment() [2/3]

template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_store_post_increment ( OutputIterator &  iterator,
Fragment const &  fragment,
typename OutputIterator::Index  offset = 0 
)

◆ iterator_store_post_increment() [3/3]

template<typename OutputIterator , typename Fragment , typename ConstPredicateAdapter >
CUTLASS_HOST_DEVICE void cutlass::iterator_store_post_increment ( OutputIterator &  iterator,
Fragment const &  fragment,
ConstPredicateAdapter  pred_it 
)

◆ lcm()

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::lcm ( value_t  a,
value_t  b 
)

Least common multiple

◆ make_Coord() [1/4]

CUTLASS_HOST_DEVICE Coord<1> cutlass::make_Coord ( int  _0)

◆ make_Coord() [2/4]

CUTLASS_HOST_DEVICE Coord<2> cutlass::make_Coord ( int  _0,
int  _1 
)

◆ make_Coord() [3/4]

CUTLASS_HOST_DEVICE Coord<3> cutlass::make_Coord ( int  _0,
int  _1,
int  _2 
)

◆ make_Coord() [4/4]

CUTLASS_HOST_DEVICE Coord<4> cutlass::make_Coord ( int  _0,
int  _1,
int  _2,
int  _3 
)

◆ make_zero() [1/2]

template<typename Scalar_ >
CUTLASS_DEVICE void cutlass::make_zero ( Scalar_ &  x)

◆ make_zero() [2/2]

template<typename Scalar_ , int kLanes_>
CUTLASS_DEVICE void cutlass::make_zero ( Vector< Scalar_, kLanes_ > &  vec)

◆ round_nearest()

template<typename dividend_t , typename divisor_t >
CUTLASS_HOST_DEVICE dividend_t cutlass::round_nearest ( dividend_t  dividend,
divisor_t  divisor 
)

Round dividend up to the nearest multiple of divisor

◆ shared_iterator_load() [1/2]

template<typename InputIterator , typename Fragment >
CUTLASS_DEVICE void cutlass::shared_iterator_load ( InputIterator &  iterator,
Fragment &  fragment 
)

◆ shared_iterator_load() [2/2]

template<typename InputIterator , typename Fragment >
CUTLASS_DEVICE void cutlass::shared_iterator_load ( InputIterator &  iterator,
Fragment &  fragment,
int  d 
)

◆ shared_iterator_store()

template<typename OutputIterator , typename Fragment >
CUTLASS_DEVICE void cutlass::shared_iterator_store ( OutputIterator &  iterator,
Fragment const &  fragment 
)