
CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater.
32 lines
3.8 KiB
JavaScript
32 lines
3.8 KiB
JavaScript
var searchData=
|
|
[
|
|
['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html',1,'cutlass']]],
|
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]],
|
|
['sharedloaditerator',['SharedLoadIterator',['../classcutlass_1_1epilogue_1_1threadblock_1_1SharedLoadIterator.html',1,'cutlass::epilogue::threadblock']]],
|
|
['sharedstorage',['SharedStorage',['../structcutlass_1_1epilogue_1_1EpilogueWorkspace_1_1SharedStorage.html',1,'cutlass::epilogue::EpilogueWorkspace']]],
|
|
['sharedstorage',['SharedStorage',['../structcutlass_1_1reduction_1_1kernel_1_1ReduceSplitK_1_1SharedStorage.html',1,'cutlass::reduction::kernel::ReduceSplitK']]],
|
|
['sharedstorage',['SharedStorage',['../unioncutlass_1_1gemm_1_1kernel_1_1GemmSplitKParallel_1_1SharedStorage.html',1,'cutlass::gemm::kernel::GemmSplitKParallel']]],
|
|
['sharedstorage',['SharedStorage',['../structcutlass_1_1epilogue_1_1threadblock_1_1DirectEpilogueTensorOp_1_1SharedStorage.html',1,'cutlass::epilogue::threadblock::DirectEpilogueTensorOp']]],
|
|
['sharedstorage',['SharedStorage',['../unioncutlass_1_1gemm_1_1kernel_1_1GemmBatched_1_1SharedStorage.html',1,'cutlass::gemm::kernel::GemmBatched']]],
|
|
['sharedstorage',['SharedStorage',['../unioncutlass_1_1gemm_1_1kernel_1_1Gemm_1_1SharedStorage.html',1,'cutlass::gemm::kernel::Gemm']]],
|
|
['sharedstorage',['SharedStorage',['../structcutlass_1_1epilogue_1_1threadblock_1_1InterleavedEpilogue_1_1SharedStorage.html',1,'cutlass::epilogue::threadblock::InterleavedEpilogue']]],
|
|
['sharedstorage',['SharedStorage',['../classcutlass_1_1gemm_1_1threadblock_1_1MmaBase_1_1SharedStorage.html',1,'cutlass::gemm::threadblock::MmaBase']]],
|
|
['sharedstorage',['SharedStorage',['../structcutlass_1_1epilogue_1_1threadblock_1_1EpilogueBase_1_1SharedStorage.html',1,'cutlass::epilogue::threadblock::EpilogueBase']]],
|
|
['simtpolicy',['SimtPolicy',['../structcutlass_1_1epilogue_1_1warp_1_1SimtPolicy.html',1,'cutlass::epilogue::warp']]],
|
|
['simtpolicy_3c_20warpshape_5f_2c_20operator_5f_2c_20layout_3a_3arowmajor_2c_20mmasimtpolicy_5f_20_3e',['SimtPolicy< WarpShape_, Operator_, layout::RowMajor, MmaSimtPolicy_ >',['../structcutlass_1_1epilogue_1_1warp_1_1SimtPolicy_3_01WarpShape___00_01Operator___00_01layout_1_1Rcef1c60e23e997017ae176c92931151d.html',1,'cutlass::epilogue::warp']]],
|
|
['sizeof_5fbits',['sizeof_bits',['../structcutlass_1_1sizeof__bits.html',1,'cutlass']]],
|
|
['sizeof_5fbits_3c_20array_3c_20t_2c_20n_2c_20registersized_20_3e_20_3e',['sizeof_bits< Array< T, N, RegisterSized > >',['../structcutlass_1_1sizeof__bits_3_01Array_3_01T_00_01N_00_01RegisterSized_01_4_01_4.html',1,'cutlass']]],
|
|
['sizeof_5fbits_3c_20bin1_5ft_20_3e',['sizeof_bits< bin1_t >',['../structcutlass_1_1sizeof__bits_3_01bin1__t_01_4.html',1,'cutlass']]],
|
|
['sizeof_5fbits_3c_20int4b_5ft_20_3e',['sizeof_bits< int4b_t >',['../structcutlass_1_1sizeof__bits_3_01int4b__t_01_4.html',1,'cutlass']]],
|
|
['sizeof_5fbits_3c_20uint1b_5ft_20_3e',['sizeof_bits< uint1b_t >',['../structcutlass_1_1sizeof__bits_3_01uint1b__t_01_4.html',1,'cutlass']]],
|
|
['sizeof_5fbits_3c_20uint4b_5ft_20_3e',['sizeof_bits< uint4b_t >',['../structcutlass_1_1sizeof__bits_3_01uint4b__t_01_4.html',1,'cutlass']]],
|
|
['sm50',['Sm50',['../structcutlass_1_1arch_1_1Sm50.html',1,'cutlass::arch']]],
|
|
['sm60',['Sm60',['../structcutlass_1_1arch_1_1Sm60.html',1,'cutlass::arch']]],
|
|
['sm61',['Sm61',['../structcutlass_1_1arch_1_1Sm61.html',1,'cutlass::arch']]],
|
|
['sm70',['Sm70',['../structcutlass_1_1arch_1_1Sm70.html',1,'cutlass::arch']]],
|
|
['sm72',['Sm72',['../structcutlass_1_1arch_1_1Sm72.html',1,'cutlass::arch']]],
|
|
['sm75',['Sm75',['../structcutlass_1_1arch_1_1Sm75.html',1,'cutlass::arch']]],
|
|
['sqrt_5fest',['sqrt_est',['../structcutlass_1_1sqrt__est.html',1,'cutlass']]],
|
|
['subbytereference',['SubbyteReference',['../classcutlass_1_1SubbyteReference.html',1,'cutlass']]]
|
|
];
|