
CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater.
6 lines
2.9 KiB
JavaScript
6 lines
2.9 KiB
JavaScript
var searchData=
|
|
[
|
|
['unique_5fptr',['unique_ptr',['../classcutlass_1_1platform_1_1unique__ptr.html#aa8a370bc7e4c2d99eb85e7fea27b3179',1,'cutlass::platform::unique_ptr::unique_ptr()'],['../classcutlass_1_1platform_1_1unique__ptr.html#a14c8bf5a5deefe4a6602ccd5c5af364c',1,'cutlass::platform::unique_ptr::unique_ptr(pointer p)']]],
|
|
['update',['update',['../classcutlass_1_1gemm_1_1device_1_1Gemm.html#aaaa871717d2fbe254a434160bc5acc65',1,'cutlass::gemm::device::Gemm::update()'],['../classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#a2b6c5275c173d73cffe8e6b6b1ccf2c1',1,'cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::update()'],['../classcutlass_1_1gemm_1_1device_1_1GemmBatched.html#ab7c3e9a33a1c62513ec6eee3e2598df6',1,'cutlass::gemm::device::GemmBatched::update()'],['../classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#a9f0c7054068175c1891e4820857603c3',1,'cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >::update()'],['../classcutlass_1_1gemm_1_1device_1_1GemmComplex.html#a55acf39b8cdf92ab537348c3f0816513',1,'cutlass::gemm::device::GemmComplex::update()'],['../classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#a08446a157a60f7f1e23315c1ece09bce',1,'cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::update()'],['../classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html#a07ad725857d7eb191cbfc135df22b781',1,'cutlass::gemm::device::GemmSplitKParallel::update()'],['../classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel_3_01ElementA___00_01LayoutA___00_01ElementBbe7c1f7154ad5b5bf9d4d28301e2b457.html#a44facae3996ed3da5fdb4398e469b773',1,'cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::update()']]]
|
|
];
|