Fix typos 2 (#842)
Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
parent c4f6b8c6bc
commit 7e370c9637
@@ -328,7 +328,7 @@ or a subset of kernels for NVIDIA Ampere and Turing architecture:
 
 ### Building a subset Tensor Core GEMM kernels
 
-To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture,
+To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture,
 use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8
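Note: in the quickstart being patched here, each cmake configure step is followed by a build-and-profile step. A minimal sketch of that flow, assuming the standard `cutlass_profiler` make target from the same guide:

```bash
# Sketch (assumed follow-up to the configure above): build the profiler,
# then run only the Tensor Core GEMM kernels selected by the filter.
$ make cutlass_profiler -j12
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*gemm_f16_*_nt_align8
```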
@@ -376,7 +376,7 @@ reference_device: Passed
 
 ### Building one CUDA Core GEMM kernel
 
-To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
 ...
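Note: the header of the next hunk quotes the profiler command the quickstart uses to exercise this single SGEMM kernel; shown in full here for readability:

```bash
# Profiler run for the one SGEMM kernel built above; the m/n/k sizes are
# the quickstart's own example (visible in the following hunk header).
$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
```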
@@ -418,7 +418,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
 ### Building a subset of Tensor Core Convolution kernels
 
 To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation
-and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16
 ...
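Note: the same configure/build/profile pattern applies to the convolution subset; a sketch, where the conv problem-size flags are an assumption about the profiler's options rather than something quoted in this diff:

```bash
# Sketch: build and profile the selected fprop kernels. The --n/--h/--w/
# --c/--k/--r/--s problem-size flags are assumed, not quoted from the diff.
$ make cutlass_profiler -j12
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*fprop_optimized_f16 \
    --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
```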
@@ -466,7 +466,7 @@ reference_device: Passed
 ### Building one Convolution CUDA kernel
 
 To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation
-and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
 ...
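Note: a sketch of running the single SIMT fprop kernel from the hunk above; the kernel name matches the CUTLASS_LIBRARY_KERNELS filter, while the invocation itself is assumed from the pattern of the earlier hunks:

```bash
# Sketch: build the profiler and run the one SIMT fprop kernel selected
# by the filter above (invocation assumed, per the earlier GEMM examples).
$ make cutlass_profiler -j12
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
```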
@@ -280,15 +280,15 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <tr id="row_0_3_0_13_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1DefaultGemmConfiguration_3_01arch_1_1OpClassWmmaTensorOp_00_0884059ecad03bea3e86c4cf722226097.html" target="_self">DefaultGemmConfiguration< arch::OpClassWmmaTensorOp, ArchTag, ElementA, ElementB, ElementC, ElementAccumulator ></a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_14_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_14_" class="arrow" onclick="toggleFolder('0_3_0_14_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_14_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
-<tr id="row_0_3_0_15_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_15_" class="arrow" onclick="toggleFolder('0_3_0_15_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
+<tr id="row_0_3_0_15_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_15_" class="arrow" onclick="toggleFolder('0_3_0_15_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
 <tr id="row_0_3_0_15_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
 <tr id="row_0_3_0_16_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_16_" class="arrow" onclick="toggleFolder('0_3_0_16_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">GemmBatched</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_16_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
-<tr id="row_0_3_0_17_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_17_" class="arrow" onclick="toggleFolder('0_3_0_17_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
+<tr id="row_0_3_0_17_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_17_" class="arrow" onclick="toggleFolder('0_3_0_17_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
 <tr id="row_0_3_0_17_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
 <tr id="row_0_3_0_18_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_18_" class="arrow" onclick="toggleFolder('0_3_0_18_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">GemmComplex</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_18_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
-<tr id="row_0_3_0_19_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_19_" class="arrow" onclick="toggleFolder('0_3_0_19_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
+<tr id="row_0_3_0_19_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_19_" class="arrow" onclick="toggleFolder('0_3_0_19_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
 <tr id="row_0_3_0_19_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
 <tr id="row_0_3_0_20_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_20_" class="arrow" onclick="toggleFolder('0_3_0_20_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html" target="_self">GemmSplitKParallel</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_20_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
@@ -594,7 +594,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <tr id="row_0_8_1_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
 <tr id="row_0_8_1_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
 <tr id="row_0_8_1_6_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
-<tr id="row_0_8_1_7_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
+<tr id="row_0_8_1_7_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
 <tr id="row_0_8_1_8_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html" target="_self">TensorDiagonalForEach</a></td><td class="desc">Launches a kernel calling a functor for each element along a tensor's diagonal </td></tr>
 <tr id="row_0_8_1_9_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorForEach.html" target="_self">TensorForEach</a></td><td class="desc">Launches a kernel calling a functor for each element in a tensor's index space </td></tr>
 <tr id="row_0_8_2_" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_0_8_2_" class="arrow" onclick="toggleFolder('0_8_2_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1reference_1_1host.html" target="_self">host</a></td><td class="desc"></td></tr>
@@ -620,7 +620,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <tr id="row_0_8_2_2_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
 <tr id="row_0_8_2_3_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
 <tr id="row_0_8_2_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
-<tr id="row_0_8_2_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
+<tr id="row_0_8_2_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
 <tr id="row_0_9_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_0_9_" class="arrow" onclick="toggleFolder('0_9_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1thread.html" target="_self">thread</a></td><td class="desc"></td></tr>
 <tr id="row_0_9_0_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1thread_1_1Matrix.html" target="_self">Matrix</a></td><td class="desc">Per-thread matrix object storing a packed matrix </td></tr>
 <tr id="row_0_10_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_0_10_" class="arrow" onclick="toggleFolder('0_10_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1transform.html" target="_self">transform</a></td><td class="desc"></td></tr>

@@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><!--header-->
 <div class="contents">
 
-<p>Parital specialization for column-major output exchanges problem size and operand.
+<p>Partial specialization for column-major output exchanges problem size and operand.
 </p>
 
 <p><code>#include <<a class="el" href="device_2gemm__batched_8h_source.html">gemm_batched.h</a>></code></p>

@@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><!--header-->
 <div class="contents">
 
-<p>Parital specialization for column-major output exchanges problem size and operand.
+<p>Partial specialization for column-major output exchanges problem size and operand.
 </p>
 
 <p><code>#include <<a class="el" href="include_2cutlass_2gemm_2device_2gemm__complex_8h_source.html">gemm_complex.h</a>></code></p>

@@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><!--header-->
 <div class="contents">
 
-<p>Parital specialization for column-major output exchanges problem size and operand.
+<p>Partial specialization for column-major output exchanges problem size and operand.
 </p>
 
 <p><code>#include <<a class="el" href="include_2cutlass_2gemm_2device_2gemm_8h_source.html">gemm.h</a>></code></p>

File diff suppressed because one or more lines are too long
@@ -130,7 +130,7 @@ Classes</h2></td></tr>
 <tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html#details">More...</a><br /></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td></tr>
-<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
+<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >::Arguments</a></td></tr>
 <tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html#details">More...</a><br /></td></tr>

File diff suppressed because one or more lines are too long
@@ -237,7 +237,7 @@ Functions</h2></td></tr>
 <tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
 <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a> (TensorView< Element, Layout > view)</td></tr>
-<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's digonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
+<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
 <tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
 <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">cutlass::reference::device::TensorUpdateDiagonal</a> (TensorView< Element, Layout > view, Element diag=Element(1))</td></tr>

@@ -125,7 +125,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params.html">cutlass::reference::device::detail::RandomGaussianFunc::Params</a></div><div class="ttdoc">Parameters structure. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:99</div></div>
 <div class="ttc" id="structcutlass_1_1Distribution_html_a07cb089b346ef06e198f6043128264fb"><div class="ttname"><a href="structcutlass_1_1Distribution.html#a07cb089b346ef06e198f6043128264fb">cutlass::Distribution::kind</a></div><div class="ttdeci">Kind kind</div><div class="ttdoc">Active variant kind. </div><div class="ttdef"><b>Definition:</b> distribution.h:64</div></div>
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params_html_a267e7ea4e77076cc9be7d639b3cef64d"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params.html#a267e7ea4e77076cc9be7d639b3cef64d">cutlass::reference::device::detail::TensorFillRandomUniformFunc::Params::Params</a></div><div class="ttdeci">Params(TensorView view_=TensorView(), typename RandomFunc::Params random_=RandomFunc::Params())</div><div class="ttdoc">Construction of Gaussian RNG functor. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:422</div></div>
-<div class="ttc" id="namespacecutlass_1_1reference_1_1device_html_a6b0f21995c4fd5c33617550e6905c78e"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > view)</div><div class="ttdoc">Fills a tensor&#39;s digonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:630</div></div>
+<div class="ttc" id="namespacecutlass_1_1reference_1_1device_html_a6b0f21995c4fd5c33617550e6905c78e"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > view)</div><div class="ttdoc">Fills a tensor&#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:630</div></div>
 <div class="ttc" id="classcutlass_1_1TensorView_html_a7d3914dd5042c9c40be9e21a7b4e9ece"><div class="ttname"><a href="classcutlass_1_1TensorView.html#a7d3914dd5042c9c40be9e21a7b4e9ece">cutlass::TensorView::extent</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE TensorCoord const & extent() const </div><div class="ttdoc">Returns the extent of the view (the size along each logical dimension). </div><div class="ttdef"><b>Definition:</b> tensor_view.h:167</div></div>
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc.html">cutlass::reference::device::detail::TensorUpdateDiagonalFunc</a></div><div class="ttdoc">Computes a random Gaussian distribution. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:645</div></div>
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params_html_afe8637b103e25ec2e9b731389fa049be"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params.html#afe8637b103e25ec2e9b731389fa049be">cutlass::reference::device::detail::RandomUniformFunc::Params::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:315</div></div>

File diff suppressed because one or more lines are too long
@@ -141,7 +141,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <li>Semaphore()
 : <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a>
 </li>
-<li>seperate_string()
+<li>separate_string()
 : <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a>
 </li>
 <li>set()

@@ -172,7 +172,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <li>Semaphore()
 : <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a>
 </li>
-<li>seperate_string()
+<li>separate_string()
 : <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a>
 </li>
 <li>sequential

@ -312,23 +312,23 @@ This inheritance list is sorted roughly, but not completely, alphabetically:</di
|
|||||||
<tr id="row_197_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, InnerProductOp ></a></td><td class="desc"></td></tr>
|
<tr id="row_197_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, InnerProductOp ></a></td><td class="desc"></td></tr>
|
||||||
<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
||||||
<tr id="row_199_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
<tr id="row_199_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
||||||
<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
|
<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
|
||||||
<tr id="row_201_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
<tr id="row_201_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
||||||
<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
||||||
<tr id="row_203_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
|
<tr id="row_203_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
|
||||||
<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
|
<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_205_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero ></a></td><td class="desc"></td></tr>
<tr id="row_206_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArguments.html" target="_self">cutlass::library::GemmArguments</a></td><td class="desc">Arguments for GEMM </td></tr>
<tr id="row_207_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayArguments.html" target="_self">cutlass::library::GemmArrayArguments</a></td><td class="desc">Arguments for GEMM - used by all the GEMM operations </td></tr>
<tr id="row_208_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayConfiguration.html" target="_self">cutlass::library::GemmArrayConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr>
<tr id="row_209_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc"></td></tr>
<tr id="row_210_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1kernel_1_1GemmBatched.html" target="_self">cutlass::gemm::kernel::GemmBatched< Mma_, Epilogue_, ThreadblockSwizzle_ ></a></td><td class="desc"></td></tr>
<tr id="row_211_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_211_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA ></a></td><td class="desc"></td></tr>
<tr id="row_213_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmBatchedConfiguration.html" target="_self">cutlass::library::GemmBatchedConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr>
<tr id="row_214_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmBatchedIdentityThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for batched GEMMs </td></tr>
<tr id="row_215_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc"></td></tr>
<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_217_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc"></td></tr>
<tr id="row_218_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmConfiguration.html" target="_self">cutlass::library::GemmConfiguration</a></td><td class="desc">Configuration for basic GEMM operations </td></tr>
<tr id="row_219_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmHorizontalThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for GEMMs </td></tr>
@ -192,7 +192,7 @@ Functions</h2></td></tr>
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a> (TensorView< Element, Layout > dst)</td></tr>
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's digonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">cutlass::reference::host::TensorUpdateDiagonal</a> (TensorView< Element, Layout > dst, Element val=Element(1))</td></tr>
@ -132,7 +132,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc_html_a4c9943f36faab7d4928b1f130d0b784c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc.html#a4c9943f36faab7d4928b1f130d0b784c">cutlass::reference::host::detail::RandomGaussianFunc::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:115</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc.html">cutlass::reference::host::detail::TensorUpdateOffDiagonalFunc</a></div><div class="ttdoc">< Layout function </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:597</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_ad0de7d4946af855288d7f9cccb9a18eb"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#ad0de7d4946af855288d7f9cccb9a18eb">cutlass::reference::host::detail::RandomUniformFunc< complex< Element > >::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:357</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a29548cb522d9c147cf34263ecac75d89"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > dst)</div><div class="ttdoc">Helper to fill a tensor&#39;s digonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:564</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a29548cb522d9c147cf34263ecac75d89"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > dst)</div><div class="ttdoc">Helper to fill a tensor&#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:564</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_a6ef7020f1108432fe51853dffb7e727c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#a6ef7020f1108432fe51853dffb7e727c">cutlass::reference::host::detail::RandomUniformFunc< complex< Element > >::operator()</a></div><div class="ttdeci">complex< Element > operator()() const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:375</div></div>
<div class="ttc" id="namespacecutlass_html_a67f9e83dd59615eff837ea66984c121c"><div class="ttname"><a href="namespacecutlass.html#a67f9e83dd59615eff837ea66984c121c">cutlass::log</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE complex< T > log(complex< T > const &z)</div><div class="ttdoc">Computes the complex exponential of z. </div><div class="ttdef"><b>Definition:</b> complex.h:381</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc_html_a4e447a80bd94cde69fa66f9e9d882b28"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc.html#a4e447a80bd94cde69fa66f9e9d882b28">cutlass::reference::host::detail::TensorFillGaussianFunc::operator()</a></div><div class="ttdeci">void operator()(Coord< Layout::kRank > const &coord) const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:236</div></div>
@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_a1161a761c596e714982fe30141211cca"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#a1161a761c596e714982fe30141211cca">cutlass::reference::host::detail::TensorForEachHelper::kActiveRank</a></div><div class="ttdeci">static int const kActiveRank</div><div class="ttdoc">Index of the active rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:44</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_aa63906bbecfe42eec1991c9176f066d9"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#aa63906bbecfe42eec1991c9176f066d9">cutlass::reference::host::detail::TensorForEachHelper::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)</div><div class="ttdoc">Constructor for general rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:47</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html">cutlass::reference::host::detail::TensorForEachHelper</a></div><div class="ttdoc">Helper to perform for-each operation. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:41</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4_html_a5029a4405a9a5e64011addb43bb88120"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">cutlass::reference::host::detail::TensorForEachHelper< Func, Rank, 0 >::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)</div><div class="ttdoc">Constructor for fastest chaning rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:67</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4_html_a5029a4405a9a5e64011addb43bb88120"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">cutlass::reference::host::detail::TensorForEachHelper< Func, Rank, 0 >::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)</div><div class="ttdoc">Constructor for fastest changing rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:67</div></div>
<div class="ttc" id="structcutlass_1_1Coord_html"><div class="ttname"><a href="structcutlass_1_1Coord.html">cutlass::Coord</a></div><div class="ttdoc">Statically-sized array specifying Coords within a tensor. </div><div class="ttdef"><b>Definition:</b> coord.h:43</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a3825b1aaaf5e5abf0de5f427e3481ada"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a3825b1aaaf5e5abf0de5f427e3481ada">cutlass::reference::host::TensorForEachLambda</a></div><div class="ttdeci">void TensorForEachLambda(Coord< Rank > extent, Func func)</div><div class="ttdoc">Iterates over the index space of a tensor and calls a C++ lambda. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:98</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a8c798c04df572b34e3ed3976d69f993d"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a8c798c04df572b34e3ed3976d69f993d">cutlass::reference::host::TensorForEach</a></div><div class="ttdeci">void TensorForEach(Coord< Rank > extent, Func &func)</div><div class="ttdoc">Iterates over the index space of a tensor. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:87</div></div>
@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html#details">More...</a><br /></td></tr>
@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html#details">More...</a><br /></td></tr>
File diff suppressed because one or more lines are too long
@ -134,17 +134,17 @@ Classes</h2></td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html">Gemm</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html">GemmBatched</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html">GemmComplex</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html">GemmSplitKParallel</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
@ -125,7 +125,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html">TensorDiagonalForEach</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Launches a kernel calling a functor for each element along a tensor's diagonal. <a href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html#details">More...</a><br /></td></tr>
@ -183,7 +183,7 @@ Functions</h2></td></tr>
<tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > view)</td></tr>
<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's digonal with 1 and 0 everywhere else. <a href="#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
<tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > view, Element diag=Element(1))</td></tr>
@ -122,7 +122,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
|
||||||
@ -247,7 +247,7 @@ Functions</h2></td></tr>
|
|||||||
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
||||||
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst)</td></tr>
|
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst)</td></tr>
|
||||||
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's digonal with 1 and 0 everywhere else. <a href="#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
|
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's diagonal with 1 and 0 everywhere else. <a href="#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
||||||
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst, Element val=Element(1))</td></tr>
|
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst, Element val=Element(1))</td></tr>
|
||||||
|
@ -14,7 +14,7 @@ var searchData=
|
|||||||
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]],
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]],
|
||||||
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]],
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]],
|
||||||
['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]],
|
['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]],
|
||||||
['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
||||||
['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]],
|
['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]],
|
||||||
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
||||||
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
||||||
|
@ -3,7 +3,7 @@ var searchData=
|
|||||||
['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum< Array< T, N > >']]],
|
['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum< Array< T, N > >']]],
|
||||||
['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]],
|
['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]],
|
||||||
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]],
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]],
|
||||||
['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
||||||
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
||||||
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
||||||
['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]],
|
['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]],
|
||||||
|
@ -115,7 +115,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1603f1c65c6d8d3d4262443b40e5c290">keys</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1603f1c65c6d8d3d4262443b40e5c290">keys</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
||||||
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a0bee40a3cc6078a08eec5d4ca4711f61">num_naked_args</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a0bee40a3cc6078a08eec5d4ca4711f61">num_naked_args</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a228e1a273d223eec4b2f6d73135d3c1e">parsed_argc</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a228e1a273d223eec4b2f6d73135d3c1e">parsed_argc</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
||||||
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">seperate_string</a>(std::string const &str, std::vector< value_t > &vals, char sep= ',')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">separate_string</a>(std::string const &str, std::vector< value_t > &vals, char sep= ',')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1944da52162e04b12a82ce0c1ade676e">tokenize</a>(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1944da52162e04b12a82ce0c1ade676e">tokenize</a>(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
||||||
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a440c25cfb006f218ff4705a43320a28b">tokenize</a>(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a440c25cfb006f218ff4705a43320a28b">tokenize</a>(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#ade127841e9730589f611b618e9440012">values</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#ade127841e9730589f611b618e9440012">values</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
||||||
|
@ -151,7 +151,7 @@ Static Public Member Functions</h2></td></tr>
|
|||||||
<tr class="memdesc:a440c25cfb006f218ff4705a43320a28b"><td class="mdescLeft"> </td><td class="mdescRight">Tokenizes a comma-delimited list of string pairs delimited by ':'. <a href="#a440c25cfb006f218ff4705a43320a28b">More...</a><br /></td></tr>
|
<tr class="memdesc:a440c25cfb006f218ff4705a43320a28b"><td class="mdescLeft"> </td><td class="mdescRight">Tokenizes a comma-delimited list of string pairs delimited by ':'. <a href="#a440c25cfb006f218ff4705a43320a28b">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a440c25cfb006f218ff4705a43320a28b"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a440c25cfb006f218ff4705a43320a28b"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplParams" colspan="2">template<typename value_t > </td></tr>
|
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplParams" colspan="2">template<typename value_t > </td></tr>
|
||||||
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplItemLeft" align="right" valign="top">static void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">seperate_string</a> (std::string const &str, std::vector< value_t > &vals, char sep= ',')</td></tr>
|
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplItemLeft" align="right" valign="top">static void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">separate_string</a> (std::string const &str, std::vector< value_t > &vals, char sep= ',')</td></tr>
|
||||||
<tr class="separator:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-attribs"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-attribs"></a>
|
||||||
@ -548,7 +548,7 @@ template<typename value_t > </div>
|
|||||||
<td class="mlabels-left">
|
<td class="mlabels-left">
|
||||||
<table class="memname">
|
<table class="memname">
|
||||||
<tr>
|
<tr>
|
||||||
<td class="memname">static void cutlass::CommandLine::seperate_string </td>
|
<td class="memname">static void cutlass::CommandLine::separate_string </td>
|
||||||
<td>(</td>
|
<td>(</td>
|
||||||
<td class="paramtype">std::string const & </td>
|
<td class="paramtype">std::string const & </td>
|
||||||
<td class="paramname"><em>str</em>, </td>
|
<td class="paramname"><em>str</em>, </td>
|
||||||
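As a usage sketch of the two static helpers documented above (the input strings here are made up for illustration), `separate_string` splits a delimited list into typed values and `tokenize` splits key:value pairs:

```cpp
#include <string>
#include <utility>
#include <vector>
#include "cutlass/util/command_line.h"

int main() {
  // Split a comma-separated list into integers: vals becomes {128, 256, 512}.
  std::vector<int> vals;
  cutlass::CommandLine::separate_string(std::string("128,256,512"), vals, ',');

  // Split a comma-delimited list of "key:value" pairs into string pairs.
  std::vector<std::pair<std::string, std::string>> tokens;
  cutlass::CommandLine::tokenize(tokens, std::string("m:1024,n:512"), ',', ':');

  return 0;
}
```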
|
@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
</div><!--header-->
|
</div><!--header-->
|
||||||
<div class="contents">
|
<div class="contents">
|
||||||
|
|
||||||
<p>Parital specialization for XOR-popc.
|
<p>Partial specialization for XOR-popc.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h_source.html">gemm.h</a>></code></p>
|
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h_source.html">gemm.h</a>></code></p>
|
||||||
|
@ -112,7 +112,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
||||||
Public Member Functions</h2></td></tr>
|
Public Member Functions</h2></td></tr>
|
||||||
<tr class="memitem:a89e10e059c3ffcfe2640cf6291353937"><td class="memItemLeft" align="right" valign="top">__inline__ __device__ </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a89e10e059c3ffcfe2640cf6291353937">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &size, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord, int64_t index)</td></tr>
|
<tr class="memitem:a89e10e059c3ffcfe2640cf6291353937"><td class="memItemLeft" align="right" valign="top">__inline__ __device__ </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a89e10e059c3ffcfe2640cf6291353937">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &size, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord, int64_t index)</td></tr>
|
||||||
<tr class="memdesc:a89e10e059c3ffcfe2640cf6291353937"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest chaning rank. <a href="#a89e10e059c3ffcfe2640cf6291353937">More...</a><br /></td></tr>
|
<tr class="memdesc:a89e10e059c3ffcfe2640cf6291353937"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest changing rank. <a href="#a89e10e059c3ffcfe2640cf6291353937">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a89e10e059c3ffcfe2640cf6291353937"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a89e10e059c3ffcfe2640cf6291353937"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table>
|
</table>
|
||||||
<h2 class="groupheader">Constructor & Destructor Documentation</h2>
|
<h2 class="groupheader">Constructor & Destructor Documentation</h2>
|
||||||
|
@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
</div><!--header-->
|
</div><!--header-->
|
||||||
<div class="contents">
|
<div class="contents">
|
||||||
|
|
||||||
<p>Parital specialization for XOR-popc.
|
<p>Partial specialization for XOR-popc.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h_source.html">gemm.h</a>></code></p>
|
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h_source.html">gemm.h</a>></code></p>
|
||||||
|
@ -113,7 +113,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
||||||
Public Member Functions</h2></td></tr>
|
Public Member Functions</h2></td></tr>
|
||||||
<tr class="memitem:a5029a4405a9a5e64011addb43bb88120"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &extent, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord)</td></tr>
|
<tr class="memitem:a5029a4405a9a5e64011addb43bb88120"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &extent, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord)</td></tr>
|
||||||
<tr class="memdesc:a5029a4405a9a5e64011addb43bb88120"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest chaning rank. <a href="#a5029a4405a9a5e64011addb43bb88120">More...</a><br /></td></tr>
|
<tr class="memdesc:a5029a4405a9a5e64011addb43bb88120"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest changing rank. <a href="#a5029a4405a9a5e64011addb43bb88120">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a5029a4405a9a5e64011addb43bb88120"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a5029a4405a9a5e64011addb43bb88120"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-attribs"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-attribs"></a>
|
||||||
|
@ -134,7 +134,7 @@ Classes</h2></td></tr>
|
|||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td></tr>
|
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td></tr>
|
||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
||||||
|
@ -141,7 +141,7 @@ Classes</h2></td></tr>
|
|||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
||||||
|
File diff suppressed because one or more lines are too long
@ -47,7 +47,7 @@
|
|||||||
or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
|
or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
|
||||||
prevalent in the CUTLASS unit tests.
|
prevalent in the CUTLASS unit tests.
|
||||||
|
|
||||||
This example has delibrately been kept similar to the basic_gemm example from cutass-1.3 to
|
This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to
|
||||||
highlight the minimum number of differences needed to transition to cutlass-2.0.
|
highlight the minimum number of differences needed to transition to cutlass-2.0.
|
||||||
|
|
||||||
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu
|
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu
|
||||||
|
@ -75,7 +75,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
|
||||||
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
||||||
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
||||||
bank-conflict free manner, and ton of other variables required to compose, intialize and launch a
|
a bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
|
||||||
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
||||||
understanding and coding complicated hardware optimizations which can easily go wrong.
|
understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -107,7 +107,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
||||||
cutlass::gemm::device::Gemm template.
|
cutlass::gemm::device::Gemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -115,7 +115,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
|
|||||||
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
||||||
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
||||||
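To make the flow above concrete, here is a compressed sketch of the described steps: argument creation with a split-k factor, scratch-space query, initialization, and launch. It is a sketch, not the example's exact source: the element types, layouts, and function signature are assumptions, the tile shapes follow the text (128x128x32 / 64x64x32 / 8x8x4), and for split-k factors greater than one the Gemm template's split-k-serial flag must also be enabled (omitted here for brevity):

```cpp
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/device_memory.h"

// Tile sizes from the text; element/layout choices are assumptions for the sketch.
using Gemm = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor,             // C and D
    float,                                           // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm70,
    cutlass::gemm::GemmShape<128, 128, 32>,          // thread-block tile
    cutlass::gemm::GemmShape<64, 64, 32>,            // warp tile
    cutlass::gemm::GemmShape<8, 8, 4>>;              // mma-op tile

cutlass::Status run(int m, int n, int k,
                    cutlass::half_t const *A, int lda,
                    cutlass::half_t const *B, int ldb,
                    float *C, int ldc,
                    float alpha, float beta, int split_k_slices) {
  // Arguments: problem size, tensor refs, epilogue scalars, split-k factor.
  Gemm::Arguments args({m, n, k}, {A, lda}, {B, ldb}, {C, ldc}, {C, ldc},
                       {alpha, beta}, split_k_slices);

  // Query scratch-space requirements and allocate the workspace if any is needed.
  size_t workspace_size = Gemm::get_workspace_size(args);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  Gemm gemm_op;
  cutlass::Status status = gemm_op.initialize(args, workspace.get());
  if (status != cutlass::Status::kSuccess) {
    return status;
  }
  return gemm_op();  // launches the kernel
}
```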
|
@ -74,7 +74,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64,
|
||||||
64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
||||||
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
||||||
bank-conflict free manner, and ton of other variables required to compose, intialize and launch a
|
a bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
|
||||||
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
||||||
understanding and coding complicated hardware optimizations which can easily go wrong.
|
understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -106,7 +106,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
||||||
cutlass::gemm::device::Gemm template.
|
cutlass::gemm::device::Gemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -114,7 +114,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
|
|||||||
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
||||||
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
||||||
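The three tile levels quoted here relate to each other in a fixed way: the thread-block tile is covered by warp tiles, and each warp tile by mma-op tiles, which is exactly what the kernel "deduces" at compile time. A quick sketch of that arithmetic for the shapes in this example (128x256x64, 64x64x16, 8x8x16); the numbers are derived directly from the text:

```cpp
#include <cstdio>

int main() {
  // Thread-block tile / warp tile / mma-op tile (MxNxK), as in the text.
  constexpr int kTbM = 128, kTbN = 256;
  constexpr int kWarpM = 64, kWarpN = 64, kWarpK = 16;
  constexpr int kInstM = 8, kInstN = 8, kInstK = 16;

  // Warps per thread-block: (128/64) x (256/64) = 2 x 4 = 8 warps (256 threads).
  constexpr int kWarps = (kTbM / kWarpM) * (kTbN / kWarpN);

  // mma-op instructions per warp tile per K-group: (64/8) x (64/8) x (16/16) = 64.
  constexpr int kMmas =
      (kWarpM / kInstM) * (kWarpN / kInstN) * (kWarpK / kInstK);

  std::printf("warps/block = %d, threads/block = %d, mma ops per warp k-step = %d\n",
              kWarps, kWarps * 32, kMmas);
  return 0;
}
```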
|
@ -76,7 +76,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128,
|
||||||
64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
||||||
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
||||||
data in bank-conflict free manner, and ton of other variables required to compose, intialize and
|
data in a bank-conflict-free manner, and a ton of other variables required to compose, initialize and
|
||||||
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
||||||
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -108,7 +108,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
||||||
kernel using the cutlass::conv::device::ImplicitGemm template.
|
kernel using the cutlass::conv::device::ImplicitGemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -117,7 +117,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K
|
|||||||
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
||||||
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
||||||
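A compressed sketch of the problem-size setup described here may help. It is an assumption-laden illustration: the filter count K is truncated in the text, so the 64 below is a placeholder, and the padding/stride/dilation values are unit defaults rather than the example's verified settings. The `Conv2dProblemSize` constructor form follows the CUTLASS convolution examples:

```cpp
#include "cutlass/conv/conv2d_problem_size.h"

// Problem size from the text: input N=1, H=64, W=64, C=128; filter R=3, S=3, C=128.
// K (number of filters) is truncated in the text -- 64 is a placeholder.
cutlass::conv::Conv2dProblemSize make_problem(int split_k_slices) {
  return cutlass::conv::Conv2dProblemSize(
      {1, 64, 64, 128},                        // input  (N, H, W, C)
      {64, 3, 3, 128},                         // filter (K, R, S, C) -- K assumed
      {1, 1, 1, 1},                            // padding
      {1, 1},                                  // stride   (h, w) -- assumed
      {1, 1},                                  // dilation (h, w) -- assumed
      cutlass::conv::Mode::kCrossCorrelation,
      split_k_slices);
}
```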
|
@ -321,7 +321,7 @@ public:
|
|||||||
int smem_write_stage_idx = 1;
|
int smem_write_stage_idx = 1;
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
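The comment above encodes the issue order of the two-stage software pipeline in the mainloop. A schematic host-side sketch of that ordering (plain stub functions, not CUTLASS APIs) under the stated assumption that shared-memory loads feed the very next mma while global loads target a later stage:

```cpp
#include <cstdio>

// Stubs standing in for the real fragment operations (not CUTLASS APIs).
void load_smem_fragment(int k)   { std::printf("  smem load feeding k=%d\n", k + 1); }
void load_global_fragment(int k) { std::printf("  global load feeding k=%d\n", k + 2); }
void warp_mma(int k)             { std::printf("  mma for k=%d\n", k); }

int main() {
  const int kIterations = 3;
  for (int k = 0; k < kIterations; ++k) {
    // 1) Shared-memory loads for the *next* warp tile go first: they feed the
    //    very next mma and therefore have the tightest latency requirement.
    load_smem_fragment(k);
    // 2) Global loads for a *future* stage are issued after them; their longer
    //    latency is hidden behind the math below.
    load_global_fragment(k);
    // 3) The current warp-level matrix multiply-accumulate overlaps both.
    warp_mma(k);
  }
  return 0;
}
```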
@ -461,7 +461,7 @@ public:
|
|||||||
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
|
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -341,7 +341,7 @@ public:
|
|||||||
int smem_write_stage_idx = 1;
|
int smem_write_stage_idx = 1;
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -325,7 +325,7 @@ public:
|
|||||||
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -346,7 +346,7 @@ public:
|
|||||||
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -73,7 +73,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64,
|
||||||
64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
||||||
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
||||||
data in bank-conflict free manner, and ton of other variables required to compose, intialize and
|
data in a bank-conflict-free manner, and a ton of other variables required to compose, initialize and
|
||||||
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
||||||
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -95,7 +95,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
||||||
kernel using the cutlass::conv::device::ImplicitGemm template.
|
kernel using the cutlass::conv::device::ImplicitGemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -104,7 +104,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K
|
|||||||
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
||||||
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
||||||
|
@ -36,7 +36,7 @@ computing GEMM. So the output also contains either a Mx1 or 1XN vector. It onl
|
|||||||
core instructions.
|
core instructions.
|
||||||
|
|
||||||
Most of the reduction is done at the gemm/warp level; see gemm/warp/mma_with_reduction_tensor_op.h
|
Most of the reduction is done at the gemm/warp level; see gemm/warp/mma_with_reduction_tensor_op.h
|
||||||
A few bit of reduction is done in the epilouge before storing the vector, see
|
A small amount of reduction is done in the epilogue before storing the vector; see
|
||||||
epilogue/threadblock/epilogue_gemm_k_reduction.h
|
epilogue/threadblock/epilogue_gemm_k_reduction.h
|
||||||
*/
|
*/
|
||||||
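In equations, the computation sketched here (notation assumed; reducing operand B along k instead yields the 1xN variant described in the text):

```latex
D = \alpha \, A B + \beta \, C,
\qquad
w_i = \sum_{k=1}^{K} A_{ik}, \quad i = 1, \dots, M
```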
|
|
||||||
|
@ -1088,7 +1088,7 @@ int main(int argc, char const **args) {
|
|||||||
|
|
||||||
// Determine kernel configuration based on head size.
|
// Determine kernel configuration based on head size.
|
||||||
// If head size is less than or equal to 64, each block operates over 64 queries and
|
// If head size is less than or equal to 64, each block operates over 64 queries and
|
||||||
// 64 keys, and parital results can be stored in the register file.
|
// 64 keys, and partial results can be stored in the register file.
|
||||||
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
||||||
// and partial results are stored in shared memory.
|
// and partial results are stored in shared memory.
|
||||||
if (options.head_size_v > 64) {
|
if (options.head_size_v > 64) {
|
||||||
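The dispatch described in that comment boils down to a compile-time choice keyed on the run-time head size. A schematic sketch of just the branching logic; the function name and template parameters below are hypothetical, not the example's actual types:

```cpp
#include <cstdio>

// Hypothetical kernel stand-in: block shape (queries x keys) and where partial
// results live are fixed at compile time.
template <int kQueriesPerBlock, int kKeysPerBlock, bool kPartialsInRegisters>
void run_fused_attention() {
  std::printf("block: %d queries x %d keys, partials in %s\n",
              kQueriesPerBlock, kKeysPerBlock,
              kPartialsInRegisters ? "registers" : "shared memory");
}

void dispatch(int head_size_v) {
  if (head_size_v > 64) {
    // Larger heads: 32 queries x 128 keys; partials go to shared memory.
    run_fused_attention<32, 128, false>();
  } else {
    // Head size <= 64: 64 queries x 64 keys; partials fit in the register file.
    run_fused_attention<64, 64, true>();
  }
}

int main() {
  dispatch(48);
  dispatch(128);
  return 0;
}
```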
|
@ -1173,7 +1173,7 @@ int main(int argc, char const **args) {
|
|||||||
|
|
||||||
// Determine kernel configuration based on head size.
|
// Determine kernel configuration based on head size.
|
||||||
// If head size is less than or equal to 64, each block operates over 64 queries and
|
// If head size is less than or equal to 64, each block operates over 64 queries and
|
||||||
// 64 keys, and parital results can be stored in the register file.
|
// 64 keys, and partial results can be stored in the register file.
|
||||||
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
||||||
// and partial results are stored in shared memory.
|
// and partial results are stored in shared memory.
|
||||||
if (options.head_size_v > 64) {
|
if (options.head_size_v > 64) {
|
||||||
|
@ -310,7 +310,7 @@ class CustomMmaPipelined : public CustomMmaBase<Shape_, Policy_, 2> {
|
|||||||
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
||||||
// issuing shared memory loads (which have the tighest latency requirement).
|
// issuing shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -600,7 +600,7 @@ class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory<
|
|||||||
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
||||||
// issuing shared memory loads (which have the tighest latency requirement).
|
// issuing shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -181,7 +181,7 @@ class PredicatedTileAccessIteratorResidualLast<
|
|||||||
BytePointer pointer_;
|
BytePointer pointer_;
|
||||||
|
|
||||||
/// Below is used when Gather is turned on. We need to record strided_offset
|
/// Below is used when Gather is turned on. We need to record strided_offset
|
||||||
/// and contiguous_offset seperated to compute the offset by using
|
/// and contiguous_offset separately to compute the offset by using
|
||||||
///
|
///
|
||||||
/// offset = contiguous_offset + indices[strided_offset]
|
/// offset = contiguous_offset + indices[strided_offset]
|
||||||
///
|
///
|
||||||
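A small standalone sketch of the offset computation spelled out in that comment, exactly as written there (any element-size or stride scaling is omitted here, as it is in the comment; the function name is an illustration, not a CUTLASS API):

```cpp
#include <cstdint>

// Gather addressing as in the comment above: the strided coordinate indexes
// into a user-provided indices array; the contiguous coordinate is used as-is.
inline int64_t gather_offset(int64_t contiguous_offset,
                             int64_t strided_offset,
                             int const *indices) {
  // offset = contiguous_offset + indices[strided_offset]
  return contiguous_offset + indices[strided_offset];
}
```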
|
@ -86,14 +86,14 @@ class gen_default_b2b_mma:
|
|||||||
"OperatorClass", str(stage), "Operator")
|
"OperatorClass", str(stage), "Operator")
|
||||||
return gen_code
|
return gen_code
|
||||||
|
|
||||||
def gen_using_FusedAddBiasEpilouge(self):
|
def gen_using_FusedAddBiasEpilogue(self):
|
||||||
gen_code = ""
|
gen_code = ""
|
||||||
for i in range(self.b2b_num - 1):
|
for i in range(self.b2b_num - 1):
|
||||||
code_using = helper.var_idx("using FusedAddBiasEpilouge", i)
|
code_using = helper.var_idx("using FusedAddBiasEpilogue", i)
|
||||||
epilouge_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp"
|
epilogue_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp"
|
||||||
template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue"
|
template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue"
|
||||||
|
|
||||||
gen_code += code_using + " = " + epilouge_name + template_args + ";\n"
|
gen_code += code_using + " = " + epilogue_name + template_args + ";\n"
|
||||||
|
|
||||||
return gen_code
|
return gen_code
|
||||||
|
|
||||||
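For orientation, the C++ that `gen_using_FusedAddBiasEpilogue` emits for stage i = 0 reconstructs, from the string concatenation above, to roughly the following. It is not standalone code: `ThreadblockShape0`, `MmaCore0`, and `EpilogueOutputOp0` are defined elsewhere in the generated file.

```cpp
// Reconstructed generator output for i = 0 (illustrative, not verbatim):
using FusedAddBiasEpilogue0 =
    typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp<
        ThreadblockShape0,
        typename MmaCore0::MmaPolicy::Operator,
        1,
        EpilogueOutputOp0,
        2>::Epilogue;
```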
@ -161,12 +161,12 @@ class gen_default_b2b_mma:
|
|||||||
MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, "
|
MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, "
|
||||||
|
|
||||||
for i in range(self.b2b_num - 1):
|
for i in range(self.b2b_num - 1):
|
||||||
epilouge_name = "EpilogueOutputOp" + str(i)
|
epilogue_name = "EpilogueOutputOp" + str(i)
|
||||||
MmaPipelined_param_list += epilouge_name + ", "
|
MmaPipelined_param_list += epilogue_name + ", "
|
||||||
|
|
||||||
for i in range(self.b2b_num - 1):
|
for i in range(self.b2b_num - 1):
|
||||||
epilouge_name = "FusedAddBiasEpilouge" + str(i)
|
epilogue_name = "FusedAddBiasEpilogue" + str(i)
|
||||||
MmaPipelined_param_list += epilouge_name + ", "
|
MmaPipelined_param_list += epilogue_name + ", "
|
||||||
|
|
||||||
for i in range(self.b2b_num):
|
for i in range(self.b2b_num):
|
||||||
MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy"
|
MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy"
|
||||||
@@ -198,7 +198,7 @@ class gen_default_b2b_mma:
 mmacore_codebody = self.gen_using_MmaCore(2)
 iterator_codebody = self.gen_using_Iterator()
 fragment_iterator_codebody = self.gen_fragment_iterator()
-epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilouge()
+epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilogue()
 threadBlockMma = self.gen_threadblockmma()
 specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma

@@ -352,7 +352,7 @@ class gen_b2b_mme_pipelined:
 }\n\
 \n\
 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-// shared memory loads (which have the tighest latency requirement).\n\
+// shared memory loads (which have the tightest latency requirement).\n\
 \n\
 //\n\
 // Mainloop\n\
@@ -459,7 +459,7 @@ class gen_b2b_mme_pipelined:
 }\n\
 \n\
 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-// shared memory loads (which have the tighest latency requirement).\n\
+// shared memory loads (which have the tightest latency requirement).\n\
 iterator_A.load(tb_frag_A);\n\
 \n\
 //\n\
@@ -490,7 +490,7 @@ class gen_b2b_mme_pipelined:
 __syncthreads();\n\
 \n\
 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-// shared memory loads (which have the tighest latency requirement).\n\
+// shared memory loads (which have the tightest latency requirement).\n\
 iterator_A.load(tb_frag_A);\n\
 \n\
 ++this->smem_iterator_B0_;\n\
@@ -549,12 +549,12 @@ class gen_b2b_mme_pipelined:
 code = "// " + str(id + 1) + " Gemm"
 code += " /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n"

-code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilouge_accu", id - 1) + ";\n"
+code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilogue_accu", id - 1) + ";\n"
 code += " " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \
-+ helper.var_idx(", after_epilouge_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n"
++ helper.var_idx(", after_epilogue_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n"

 # FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
-code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilouge_accu", id - 1) + ");\n"
+code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilogue_accu", id - 1) + ");\n"
 # FragmentB1 tb_frag_B1;
 code += " " + helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n"
 # tb_frag_B1.clear();
@@ -990,7 +990,7 @@ class gen_threadblock:


 self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-self.gen_b2b_mma_piplined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
+self.gen_b2b_mma_pipelined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
 self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)


@@ -1001,7 +1001,7 @@ class gen_threadblock:

 with open(self.file_dir + "b2b_mma_base.h", "w+") as f:
 f.write(base_code)
-pipeline_code = self.gen_b2b_mma_piplined.gen_code(first_use_1stage = first_use_1stage)
+pipeline_code = self.gen_b2b_mma_pipelined.gen_code(first_use_1stage = first_use_1stage)
 print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir)

 with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f:
@@ -45,7 +45,7 @@ class gen_verify:
 self.user_header_file = ""
 for header in user_header_file:
 self.user_header_file += "#include \"" + header + "\"\n"
-self.seperate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
+self.separate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
 self.gen_params()
 self.output_dir = output_dir

@@ -53,14 +53,14 @@ class gen_verify:
 def gen_code(self):
 code = ""
 code += self.user_header_file
-code += self.seperate_cutlass.gen_using(False) #False -> Turing, True -> Volta
+code += self.separate_cutlass.gen_using(False) #False -> Turing, True -> Volta

 code_body = ""
 for i in range(self.b2b_num):
 code_body += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n"
 code_body += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n"

-code_body += self.seperate_cutlass.gen_run()
+code_body += self.separate_cutlass.gen_run()

 code += ir.gen_func(self.name, self.params, code_body)
 helper.write_2_headfile("cutlass_verify.h", self.output_dir, code)
@@ -87,6 +87,6 @@ class gen_verify:

 def gen_initialize():
 code = ""
-initialize_code = self.seperate_cutlass.gen_initialize()
+initialize_code = self.separate_cutlass.gen_initialize()

 code = ir.gen_func("initialize", [[]])
@@ -83,23 +83,23 @@ def list_2_string(input_list, ):
 return rtn_string


-def get_epilouge_info(layer_info):
+def get_epilogue_info(layer_info):
 return layer_info['epilogue']

 def get_epilogue_tp(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['tp']

 def get_epilogue_add_bias_or_not(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['bias']['addbias']

 def get_epilogue_add_bias_tp(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['bias']['bias_tp']

 def get_epilogue_args(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['args']

 def get_epilogue_bias_shape(layer_info):
@@ -33,7 +33,7 @@
 \brief Hopper GEMM example leveraging collective operation builders.

 This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels
-targetting the NVIDIA Hopper architecture.
+targeting the NVIDIA Hopper architecture.

 Background and motivation
 -------------------------
@@ -45,7 +45,7 @@
 However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed
 in CUTLASS 3:
 (1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without
-specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targetting
+specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targeting
 Ampere specify that three pipeline stages should be used regardless of the sizes of operands.
 If one wished to increase this value, one would also need to specify all other template parameters.
 This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface.
@@ -55,7 +55,7 @@

 Alongside these opportunities for improvement, the Hopper architecture offers new features that increase
 the number of valid configurations of a kernel. In addition to the many template parameters already available
-in CUTLASS 2 kernels, CUTLASS 3 kernels targetting Hopper also have various scheduling modes to select from that control:
+in CUTLASS 2 kernels, CUTLASS 3 kernels targeting Hopper also have various scheduling modes to select from that control:
 (1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async)
 (2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization")
 (3) whether persistent thread blocks should be used
@@ -64,13 +64,13 @@
 Introduction to the CollectiveBuilder
 -------------------------------------
 CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters
-for kernels targetting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder
+for kernels targeting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder
 takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically
 determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided
 parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template
 parameters), the CollectiveBuilder will also automatically select these.

-Unlike DefaultGemmConfigurations a parital specialization of the CollectiveBuilder is not needed for many
+Unlike DefaultGemmConfigurations a partial specialization of the CollectiveBuilder is not needed for many
 configurations of operand types. Instead the CollectiveBuilder "builds" a configuration based on generic
 properties of the specified operands, layouts, and other parameters. For example, when the stage count
 is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that
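As context for the docstring edited above, this is roughly what a CollectiveBuilder instantiation with `Auto` schedule and stage-count parameters looks like. It is a sketch only: the header path and the exact template-parameter order are recalled from CUTLASS 3's example 49 and should be verified against the headers, not treated as the definitive signature.

```cpp
// Sketch: mainloop built by the CollectiveBuilder with Auto policies.
// Assumed header path; verify against your CUTLASS 3 checkout.
#include "cutlass/gemm/collective/collective_builder.hpp"

using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cutlass::half_t, cutlass::layout::ColumnMajor, 8,  // A: element, layout, alignment
    cutlass::half_t, cutlass::layout::ColumnMajor, 8,  // B: element, layout, alignment
    float,                                             // accumulator element
    cute::Shape<cute::_128, cute::_128, cute::_64>,    // threadblock tile (M, N, K)
    cute::Shape<cute::_1, cute::_1, cute::_1>,         // cluster shape
    cutlass::gemm::collective::StageCountAuto,         // builder picks the stage count
    cutlass::gemm::collective::KernelScheduleAuto      // builder picks the schedule
  >::CollectiveOp;
```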
@@ -90,7 +90,7 @@
 Details of this example
 -----------------------
 This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.
-This example also illustrates how CUTLASS 3 GEMMs targetting Hopper automatically support batched GEMMs by simply
+This example also illustrates how CUTLASS 3 GEMMs targeting Hopper automatically support batched GEMMs by simply
 extending the problem size with an additional tensor rank.

 Example usage:
@@ -162,7 +162,7 @@ struct Options {

 out << "49_hopper_gemm_schedules_with_collective_builder\n\n"
 << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n"
-<< " performant kernels targetting NVIDIA's Hopper architecture.\n\n"
+<< " performant kernels targeting NVIDIA's Hopper architecture.\n\n"
 << "Options:\n\n"
 << " --help If specified, displays this usage statement\n\n"
 << " --m=<int> Sets the M extent of the GEMM\n"
@@ -718,7 +718,7 @@ make_tma_copy(CopyOp,
 << "\nswizzle " << smem_swizzle
 << "\nl2Promotion " << tma_l2Promotion
 << "\noobFill " << tma_oobFill << std::endl;
-std::cerr << "Error: Failed to intialize the TMA descriptor " << result << std::endl;
+std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl;
 assert(false);
 }
 #endif // (__CUDACC_VER_MAJOR__ >= 12)
@@ -98,11 +98,11 @@ struct OpClassSimt {};

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Tag classifing operators as Tensor Core operations.
+/// Tag classifying operators as Tensor Core operations.
 struct OpClassTensorOp {};

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Tag classifing operators as WMMA Tensor Core operations
+/// Tag classifying operators as WMMA Tensor Core operations
 struct OpClassWmmaTensorOp {};

 /////////////////////////////////////////////////////////////////////////////////////////////////
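The tags in this hunk are empty types used purely for compile-time dispatch. A minimal standalone sketch of that pattern (the `KernelName` trait is hypothetical, not a CUTLASS API; local stand-ins mirror the tag structs shown above, which in CUTLASS are assumed to live in the `cutlass::arch` namespace):

```cpp
#include <cstdio>

// Local stand-ins mirroring the tag types in the hunk above.
struct OpClassSimt {};
struct OpClassTensorOp {};
struct OpClassWmmaTensorOp {};

// Hypothetical trait: an empty tag type selects an implementation at compile time.
template <typename OperatorClass> struct KernelName;
template <> struct KernelName<OpClassSimt>         { static constexpr const char* value = "CUDA-core (SIMT)"; };
template <> struct KernelName<OpClassTensorOp>     { static constexpr const char* value = "Tensor Core (mma)"; };
template <> struct KernelName<OpClassWmmaTensorOp> { static constexpr const char* value = "WMMA Tensor Core"; };

int main() {
  std::printf("%s\n", KernelName<OpClassTensorOp>::value);  // prints: Tensor Core (mma)
  return 0;
}
```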
@@ -230,7 +230,7 @@ public:
 offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
 offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;

-// Intialize pointers for gemm_k=0
+// Initialize pointers for gemm_k=0
 TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_};

 pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
@@ -341,7 +341,7 @@ public:

 next_idx = 1;

-// Restore bytes in q coordinate (Mma in filter s dimenstion)
+// Restore bytes in q coordinate (Mma in filter s dimension)
 reset_bytes = reset_bytes_s_;

 } else {
@@ -351,7 +351,7 @@ public:

 next_idx = 2;

-// Restore bytes in p and q coordinate (Mma in filter s and r dimenstion)
+// Restore bytes in p and q coordinate (Mma in filter s and r dimension)
 reset_bytes = reset_bytes_r_;
 }
 #else
@@ -195,7 +195,7 @@ public:
 s = filter_s_[iteration_contiguous_];
 }
 else {
-/// Multiple access to support non-128b alignment in contiguous dimenstion
+/// Multiple access to support non-128b alignment in contiguous dimension
 c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C;
 int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C;
 s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S;
@@ -212,7 +212,7 @@ public:

 if (kAccessesPerVector > 1) {
 // This code section is only to support non-128b alignment
-// Multiple access to support non-128b alignment in contiguous dimenstion
+// Multiple access to support non-128b alignment in contiguous dimension
 int wrap_c;
 params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements);

@@ -241,7 +241,7 @@ public:
 int rs_plane_idx = 0;

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -238,7 +238,7 @@ public:
 int smem_write_stage_idx = 1;

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -67,7 +67,7 @@ static int get_strided_dgrad_tile_m(
 // CUTLASS strided dgrad performance for stride > filter, i.e., stride={2x2} and filter={1x1})
 //
 // * Optimization *
-// Only launch CTAs in M dimenstion which contribute to a row in Dx output
+// Only launch CTAs in M dimension which contribute to a row in Dx output
 //
 //
 // * Constraints *
@@ -107,7 +107,7 @@ struct StridedDgradHorizontalThreadblockSwizzle :
 // compute number of tiles in m dimension
 int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());

-// compute number of tiles in n dimenstion
+// compute number of tiles in n dimension
 int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();

 return gemm::GemmCoord(
@@ -148,7 +148,7 @@ struct StridedDgradIdentityThreadblockSwizzle :
 // compute number of tiles in m dimension
 int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());

-// compute number of tiles in n dimenstion
+// compute number of tiles in n dimension
 int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();

 return gemm::GemmCoord(
@@ -77,7 +77,7 @@ namespace threadblock {
 // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
 /////////////////////////////////////////////////////////////////////////////////////////////////
 template <
-/// Epilouge Shape
+/// Epilogue Shape
 typename Shape_,
 /// Warp-level mma operator
 typename WarpMmaTensorOp_,
@@ -78,7 +78,7 @@ namespace threadblock {
 // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
 /////////////////////////////////////////////////////////////////////////////////////////////////
 template <
-/// Epilouge Shape
+/// Epilogue Shape
 typename Shape_,
 /// Warp-level mma operator
 typename WarpMmaTensorOp_,
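The comment repeated in the two hunks above encodes the split-complex product these epilogues assume. A small self-contained sketch of that arithmetic (plain C++, independent of CUTLASS):

```cpp
#include <cstdio>

// Split-complex multiply: d = a * b with
//   dr = ar*br - ai*bi  and  di = ar*bi + ai*br,
// exactly as in the comment above.
struct Complexf { float r, i; };

Complexf cmul(Complexf a, Complexf b) {
  return { a.r * b.r - a.i * b.i,    // real part
           a.r * b.i + a.i * b.r };  // imaginary part
}

int main() {
  Complexf d = cmul({1.0f, 2.0f}, {3.0f, -1.0f});
  std::printf("d = %f + j %f\n", d.r, d.i);  // expected: 5 + j 5
  return 0;
}
```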
@@ -198,7 +198,7 @@ private:
 /// A thread's starting column
 Index thread_start_column_;

-/// Initial thread ouput location
+/// Initial thread output location
 int thread_start_n_, thread_start_p_, thread_start_q_;

 /// Current threadblock tile index
@@ -186,10 +186,10 @@ private:
 /// Extent of the matrix tile in rows
 Index extent_row_;

-/// Starting Dx h and w dimenstion for strided dgrad mapping
+/// Starting Dx h and w dimension for strided dgrad mapping
 int start_h_, start_w_;

-/// Effective Dy P and Q dimenstions for strided dgrad mapping
+/// Effective Dy P and Q dimensions for strided dgrad mapping
 int p_, q_;

 /// A thread's starting row position (assuming steady-state predicates have been computed)
@@ -547,7 +547,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -521,7 +521,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -476,7 +476,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -454,7 +454,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -475,7 +475,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -194,7 +194,7 @@ class GemmLayernormMainloopFusion :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -219,7 +219,7 @@ class GemmUniversal :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -198,7 +198,7 @@ class GemmUniversalWithBroadcast :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -211,7 +211,7 @@ class GemmWithKReduction :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -348,7 +348,7 @@ public:
 };
 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchange operand.
+/// Partial specialization for column-major output exchange operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -325,7 +325,7 @@ public:
 };
 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchange operand.
+/// Partial specialization for column-major output exchange operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -408,7 +408,7 @@ public:
 call GEMM mainloop for with RowMajor efficient-epilogue
 ********************************************************************************************************/

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -563,7 +563,7 @@ For the mainloop and trmm kernel, `A` and `B` points to left-side and right-side
 call GEMM mainloop for with RowMajor efficient-epilogue
 ********************************************************************************************************/

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -137,7 +137,7 @@ struct DefaultGemmWithBroadcast {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization: ArchTag = cutlass::arch::Sm70
+/// Partial specialization: ArchTag = cutlass::arch::Sm70
 ///
 ///
 template <
@@ -138,7 +138,7 @@ struct DefaultGemmWithReduction {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization: ArchTag = cutlass::arch::Sm70
+/// Partial specialization: ArchTag = cutlass::arch::Sm70
 ///
 ///
 template <
@@ -138,7 +138,7 @@
 i = i_macro
 j = j_macro

-Handling cases with grid dimensions that aren't multiples of eachother
+Handling cases with grid dimensions that aren't multiples of each other
 ----------------------------------------------------------------------
 Even though threadblock shapes M and N are typically multiples of one another, the grid
 for a given problem may not have dimensions of the same ratio as that of the threadblock.
@@ -196,7 +196,7 @@ public:
 // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
 #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
 if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n");
+printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
 return;
 }
 #endif
@@ -186,7 +186,7 @@ public:
 // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
 #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
 if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n");
+printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
 return;
 }
 #endif
@@ -258,7 +258,7 @@ public:
 // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
 #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
 if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n");
+printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
 return;
 }
 #endif
@@ -271,7 +271,7 @@ public:
 }

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -321,7 +321,7 @@ public:
 iterator_B_imag.clear_mask(gemm_k_iterations <= 1);

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -83,7 +83,7 @@ struct TensorReductionAffineContiguousParams {
 uint64_t outer_count; /// Number of elements in outer index space

 ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
-ElementSource const * source; /// Poitner to source pointer of rank kRank
+ElementSource const * source; /// Pointer to source pointer of rank kRank
 ReductionOp reduction_op; /// Reduction operator
 ElementCompute reduction_identity; /// Identity element used by reduction operator
 ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
@@ -85,7 +85,7 @@ struct TensorReductionAffineStridedParams {
 uint64_t outer_count; /// Number of elements in outer index space

 ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
-ElementSource const * source; /// Poitner to source pointer of rank kRank
+ElementSource const * source; /// Pointer to source pointer of rank kRank
 ReductionOp reduction_op; /// Reduction operator
 ElementCompute reduction_identity; /// Identity element for reduction operator
 ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
@@ -399,7 +399,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
 bool is_residue_tile_;

 /// Below is used when Gather is turned on. We need to record strided_offset
-/// and contiguous_offset seperated to compute the offset by using
+/// and contiguous_offset separated to compute the offset by using
 ///
 /// offset = contiguous_offset + indices[strided_offset]
 ///
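A tiny standalone sketch of the gather addressing rule quoted in the comment above (illustrative only; in the real iterator the offsets are maintained in bytes with strides already folded into the index array, and the names below are hypothetical):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Gather addressing: the strided coordinate is indirected through an index
// array while the contiguous coordinate is used directly, i.e.
//   offset = contiguous_offset + indices[strided_offset]
std::int64_t gather_offset(std::int64_t contiguous_offset,
                           std::int64_t strided_offset,
                           std::vector<std::int64_t> const& indices) {
  return contiguous_offset + indices[strided_offset];
}

int main() {
  std::vector<std::int64_t> indices = {0, 40, 10, 30};  // gathered row offsets
  // Element at contiguous position 3 within the second gathered row (offset 40):
  std::printf("%lld\n", static_cast<long long>(gather_offset(3, 1, indices)));  // 43
  return 0;
}
```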
@@ -1079,7 +1079,7 @@ class RegularTileIterator<
 //

 /// The crosswised elements will be stored in a line.
-/// line_size is size of crosswised dimention plus padding.
+/// line_size is size of crosswised dimension plus padding.
 /// in units of AccessType
 Index line_size;

@@ -347,7 +347,7 @@ creating GEMM-B tile in shared memory.
 The improvements covered by optimized iterators are:
 - (a) Precomputing kernel-invariant pointer deltas on the host
 - (b) Computing cta-invariant mask predicates on device-side iterator ctors
-- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimenstions to convolution tensors.
+- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimensions to convolution tensors.
 For example, _optimized_ activation iterator uses fast divmod to map GEMM _M_ to NPQ
 for activation iterator
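To make the NPQ mapping in item (c) concrete, here is a sketch of decomposing a linear GEMM _M_ index into activation coordinates (n, p, q) with ordinary integer divmod; CUTLASS's fast_math.h performs the same decomposition but replaces `/` and `%` with precomputed multiply-shift "fast divmod", which is the part this sketch deliberately omits for clarity.

```cpp
#include <cstdio>

// Decompose a linear GEMM M index into activation coordinates (n, p, q),
// where the output tensor has extents N x P x Q.
void m_to_npq(int m, int P, int Q, int& n, int& p, int& q) {
  n = m / (P * Q);
  int residual = m % (P * Q);
  p = residual / Q;
  q = residual % Q;
}

int main() {
  int n, p, q;
  m_to_npq(/*m=*/123, /*P=*/7, /*Q=*/5, n, p, q);
  std::printf("n=%d p=%d q=%d\n", n, p, q);  // 123 = 3*35 + 3*5 + 3 -> (3, 3, 3)
  return 0;
}
```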
@@ -587,7 +587,8 @@ To instantiate all operations supporting all tile sizes, data types, and alignme
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=all
 ```
-The above command line generates about twenty thousand kernels targetting NVIDIA Ampere, Turing, and Volta architectures.
+
+The above command line generates about twenty thousand kernels targeting NVIDIA Ampere, Turing, and Volta architectures.
 Compiling thousands of kernels for three different architectures is time consuming. Additionaly, this would also result
 in a large binary size and on some platforms linker to fail on building the library.

@@ -641,13 +642,13 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop,s1681
 $ cmake .. -DCUTLASS_NVCC_ARCHS='50;60;61;70;75;80' -DCUTLASS_LIBRARY_KERNELS=sfprop
 ```

-**Example.** All forward propagation (fprop) convolution kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere's 16816 Tensor Core operation
+**Example.** All forward propagation (fprop) convolution kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere's 16816 Tensor Core operation
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop_*_f16
 ```

 **Example.** All backward weight gradient (wgrad) convolution kernels with FP32 accumulation, FP16 input, and optimized global memory iterator
-targetting NVIDIA Ampere, Turing, and Volta Tensor Core operations
+targeting NVIDIA Ampere, Turing, and Volta Tensor Core operations
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*wgrad_optimized_f16
 ```
@@ -573,7 +573,7 @@ bool TestSpecificConv2d(
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm>
@@ -517,7 +517,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm, int InterleavedK>
@@ -502,7 +502,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm,
@@ -464,7 +464,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm>
@@ -522,7 +522,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -638,7 +638,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B
 >;

-// Epilouge
+// Epilogue
 using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
 TagToStrideC_t<LayoutC>,
 TagToStrideC_t<LayoutC>,
@@ -321,13 +321,13 @@ public:
 NumericTypeID element_C, /// Data type of C and D matrix

 void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices
-void const * const * ptr_C_imag, /// Pointer to array containing poitners to imaginary part of C matrices
+void const * const * ptr_C_imag, /// Pointer to array containing pointers to imaginary part of C matrices

 int64_t ldc_real, /// Leading dimension of real part of C matrix
 int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix

 void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices
-void * const * ptr_D_imag, /// Pointer to array containing poitners to imaginary part of D matrices
+void * const * ptr_D_imag, /// Pointer to array containing pointers to imaginary part of D matrices

 int64_t ldd_real, /// Leading dimension of real part of D matrix
 int64_t ldd_imag /// Leading dimension of imaginary part of D matrix
@@ -518,7 +518,7 @@ struct GemmDescription : public OperationDescription {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Desciprion for structured sparse GEMMs.
+/// Description for structured sparse GEMMs.
 struct SparseGemmDescription : public GemmDescription {

 /// Description structure for structured sparse GEMM
@@ -1160,7 +1160,7 @@ struct GemmGroupedArguments {
 // OperationKind: kSparseGemm
 //

-/// Computes GEMM assumine one of the inputs has 2:4 structured sparsity.
+/// Computes GEMM assuming one of the inputs has 2:4 structured sparsity.
 struct SparseGemmConfiguration {

 GemmUniversalMode mode;
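For readers unfamiliar with the term in this hunk: 2:4 structured sparsity means every aligned group of four consecutive elements contains at most two nonzeros. A standalone checker sketch of that property (illustrative only, not part of CUTLASS):

```cpp
#include <cstddef>
#include <cstdio>

// Returns true if every aligned group of 4 elements has at most 2 nonzeros,
// i.e. the array satisfies the 2:4 structured-sparsity pattern.
bool is_2_4_sparse(float const* data, std::size_t n) {
  for (std::size_t g = 0; g + 4 <= n; g += 4) {
    int nonzeros = 0;
    for (int j = 0; j < 4; ++j) {
      nonzeros += (data[g + j] != 0.0f);
    }
    if (nonzeros > 2) {
      return false;
    }
  }
  return true;
}

int main() {
  float ok[8]  = {1, 0, 2, 0,  0, 3, 0, 4};  // 2 nonzeros per group of 4
  float bad[4] = {1, 2, 3, 0};               // 3 nonzeros in one group
  std::printf("%d %d\n", is_2_4_sparse(ok, 8), is_2_4_sparse(bad, 4));  // 1 0
  return 0;
}
```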
@@ -1187,7 +1187,7 @@ struct SparseGemmArguments {
 void const *B; /// pointer to B matrix
 void const *C; /// pointer to C matrix
 void *D; /// pointer to D matrix
-void const *E; /// pointer to E matric (metadata)
+void const *E; /// pointer to E matrix (metadata)

 void const *alpha; /// pointer to alpha scalar
 void const *beta; /// pointer to beta scalar
@@ -1465,7 +1465,7 @@ struct ConvArguments {
 /// pointer to implicit gemm matrix C
 void const *C;

-/// pointer to implicit gemm desitination matrix D
+/// pointer to implicit gemm destination matrix D
 void *D;

 /// Host or device pointer to alpha scalar
@@ -1487,16 +1487,16 @@ struct ConvArguments {
 //
 struct ReductionConfiguration {

-/// Redcution problem size
+/// Reduction problem size
 MatrixCoord problem_size;

 /// Number of partitions to reduce
 int partitions;

-/// Number of lements between each partition
+/// Number of elements between each partition
 int64_t partition_stride;

-/// leading dimension of 'w'orksace operand
+/// leading dimension of 'w'orkspace operand
 int64_t ldw;

 /// leading dimension of 's'ource operand