Fix typos 2 (#842)

Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
This commit is contained in:
Alexander Pivovarov 2023-03-09 20:22:56 -08:00 committed by GitHub
parent c4f6b8c6bc
commit 7e370c9637
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
161 changed files with 310 additions and 309 deletions

View File

@ -328,7 +328,7 @@ or a subset of kernels for NVIDIA Ampere and Turing architecture:
### Building a subset of Tensor Core GEMM kernels ### Building a subset of Tensor Core GEMM kernels
To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture, To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture,
use the below cmake command line: use the below cmake command line:
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8
@ -376,7 +376,7 @@ reference_device: Passed
### Building one CUDA Core GEMM kernel ### Building one CUDA Core GEMM kernel
To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
... ...
@ -418,7 +418,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
### Building a subset of Tensor Core Convolution kernels ### Building a subset of Tensor Core Convolution kernels
To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation
and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16
... ...
@ -466,7 +466,7 @@ reference_device: Passed
### Building one Convolution CUDA kernel ### Building one Convolution CUDA kernel
To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with FP32 accumulation To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with FP32 accumulation
and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
... ...

View File

@ -280,15 +280,15 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<tr id="row_0_3_0_13_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1DefaultGemmConfiguration_3_01arch_1_1OpClassWmmaTensorOp_00_0884059ecad03bea3e86c4cf722226097.html" target="_self">DefaultGemmConfiguration&lt; arch::OpClassWmmaTensorOp, ArchTag, ElementA, ElementB, ElementC, ElementAccumulator &gt;</a></td><td class="desc"></td></tr> <tr id="row_0_3_0_13_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1DefaultGemmConfiguration_3_01arch_1_1OpClassWmmaTensorOp_00_0884059ecad03bea3e86c4cf722226097.html" target="_self">DefaultGemmConfiguration&lt; arch::OpClassWmmaTensorOp, ArchTag, ElementA, ElementB, ElementC, ElementAccumulator &gt;</a></td><td class="desc"></td></tr>
<tr id="row_0_3_0_14_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_14_" class="arrow" onclick="toggleFolder('0_3_0_14_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr> <tr id="row_0_3_0_14_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_14_" class="arrow" onclick="toggleFolder('0_3_0_14_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
<tr id="row_0_3_0_14_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_14_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
<tr id="row_0_3_0_15_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_15_" class="arrow" onclick="toggleFolder('0_3_0_15_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr> <tr id="row_0_3_0_15_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_15_" class="arrow" onclick="toggleFolder('0_3_0_15_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_0_3_0_15_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_15_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
<tr id="row_0_3_0_16_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_16_" class="arrow" onclick="toggleFolder('0_3_0_16_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">GemmBatched</a></td><td class="desc"></td></tr> <tr id="row_0_3_0_16_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_16_" class="arrow" onclick="toggleFolder('0_3_0_16_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">GemmBatched</a></td><td class="desc"></td></tr>
<tr id="row_0_3_0_16_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_16_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
<tr id="row_0_3_0_17_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_17_" class="arrow" onclick="toggleFolder('0_3_0_17_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr> <tr id="row_0_3_0_17_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_17_" class="arrow" onclick="toggleFolder('0_3_0_17_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_0_3_0_17_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_17_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
<tr id="row_0_3_0_18_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_18_" class="arrow" onclick="toggleFolder('0_3_0_18_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">GemmComplex</a></td><td class="desc"></td></tr> <tr id="row_0_3_0_18_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_18_" class="arrow" onclick="toggleFolder('0_3_0_18_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">GemmComplex</a></td><td class="desc"></td></tr>
<tr id="row_0_3_0_18_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_18_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
<tr id="row_0_3_0_19_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_19_" class="arrow" onclick="toggleFolder('0_3_0_19_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr> <tr id="row_0_3_0_19_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_19_" class="arrow" onclick="toggleFolder('0_3_0_19_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_0_3_0_19_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_19_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
<tr id="row_0_3_0_20_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_20_" class="arrow" onclick="toggleFolder('0_3_0_20_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html" target="_self">GemmSplitKParallel</a></td><td class="desc"></td></tr> <tr id="row_0_3_0_20_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span id="arr_0_3_0_20_" class="arrow" onclick="toggleFolder('0_3_0_20_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html" target="_self">GemmSplitKParallel</a></td><td class="desc"></td></tr>
<tr id="row_0_3_0_20_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr> <tr id="row_0_3_0_20_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
@ -594,7 +594,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<tr id="row_0_8_1_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr> <tr id="row_0_8_1_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
<tr id="row_0_8_1_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr> <tr id="row_0_8_1_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr>
<tr id="row_0_8_1_6_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr> <tr id="row_0_8_1_6_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
<tr id="row_0_8_1_7_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td><td class="desc">Parital specialization for XOR-popc </td></tr> <tr id="row_0_8_1_7_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
<tr id="row_0_8_1_8_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html" target="_self">TensorDiagonalForEach</a></td><td class="desc">Launches a kernel calling a functor for each element along a tensor's diagonal </td></tr> <tr id="row_0_8_1_8_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html" target="_self">TensorDiagonalForEach</a></td><td class="desc">Launches a kernel calling a functor for each element along a tensor's diagonal </td></tr>
<tr id="row_0_8_1_9_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorForEach.html" target="_self">TensorForEach</a></td><td class="desc">Launches a kernel calling a functor for each element in a tensor's index space </td></tr> <tr id="row_0_8_1_9_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorForEach.html" target="_self">TensorForEach</a></td><td class="desc">Launches a kernel calling a functor for each element in a tensor's index space </td></tr>
<tr id="row_0_8_2_" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span id="arr_0_8_2_" class="arrow" onclick="toggleFolder('0_8_2_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1reference_1_1host.html" target="_self">host</a></td><td class="desc"></td></tr> <tr id="row_0_8_2_" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span id="arr_0_8_2_" class="arrow" onclick="toggleFolder('0_8_2_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1reference_1_1host.html" target="_self">host</a></td><td class="desc"></td></tr>
@ -620,7 +620,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<tr id="row_0_8_2_2_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr> <tr id="row_0_8_2_2_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
<tr id="row_0_8_2_3_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr> <tr id="row_0_8_2_3_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr>
<tr id="row_0_8_2_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr> <tr id="row_0_8_2_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
<tr id="row_0_8_2_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td><td class="desc">Parital specialization for XOR-popc </td></tr> <tr id="row_0_8_2_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
<tr id="row_0_9_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_0_9_" class="arrow" onclick="toggleFolder('0_9_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1thread.html" target="_self">thread</a></td><td class="desc"></td></tr> <tr id="row_0_9_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_0_9_" class="arrow" onclick="toggleFolder('0_9_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1thread.html" target="_self">thread</a></td><td class="desc"></td></tr>
<tr id="row_0_9_0_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1thread_1_1Matrix.html" target="_self">Matrix</a></td><td class="desc">Per-thread matrix object storing a packed matrix </td></tr> <tr id="row_0_9_0_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1thread_1_1Matrix.html" target="_self">Matrix</a></td><td class="desc">Per-thread matrix object storing a packed matrix </td></tr>
<tr id="row_0_10_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_0_10_" class="arrow" onclick="toggleFolder('0_10_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1transform.html" target="_self">transform</a></td><td class="desc"></td></tr> <tr id="row_0_10_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_0_10_" class="arrow" onclick="toggleFolder('0_10_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1transform.html" target="_self">transform</a></td><td class="desc"></td></tr>

View File

@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
</div><!--header--> </div><!--header-->
<div class="contents"> <div class="contents">
<p>Parital specialization for column-major output exchanges problem size and operand. <p>Partial specialization for column-major output exchanges problem size and operand.
</p> </p>
<p><code>#include &lt;<a class="el" href="device_2gemm__batched_8h_source.html">gemm_batched.h</a>&gt;</code></p> <p><code>#include &lt;<a class="el" href="device_2gemm__batched_8h_source.html">gemm_batched.h</a>&gt;</code></p>

View File

@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
</div><!--header--> </div><!--header-->
<div class="contents"> <div class="contents">
<p>Parital specialization for column-major output exchanges problem size and operand. <p>Partial specialization for column-major output exchanges problem size and operand.
</p> </p>
<p><code>#include &lt;<a class="el" href="include_2cutlass_2gemm_2device_2gemm__complex_8h_source.html">gemm_complex.h</a>&gt;</code></p> <p><code>#include &lt;<a class="el" href="include_2cutlass_2gemm_2device_2gemm__complex_8h_source.html">gemm_complex.h</a>&gt;</code></p>

View File

@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
</div><!--header--> </div><!--header-->
<div class="contents"> <div class="contents">
<p>Parital specialization for column-major output exchanges problem size and operand. <p>Partial specialization for column-major output exchanges problem size and operand.
</p> </p>
<p><code>#include &lt;<a class="el" href="include_2cutlass_2gemm_2device_2gemm_8h_source.html">gemm.h</a>&gt;</code></p> <p><code>#include &lt;<a class="el" href="include_2cutlass_2gemm_2device_2gemm_8h_source.html">gemm.h</a>&gt;</code></p>

File diff suppressed because one or more lines are too long

View File

@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;::Arguments</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html#details">More...</a><br /></td></tr>

File diff suppressed because one or more lines are too long

View File

@ -237,7 +237,7 @@ Functions</h2></td></tr>
<tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a> (TensorView&lt; Element, Layout &gt; view)</td></tr> <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a> (TensorView&lt; Element, Layout &gt; view)</td></tr>
<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr> <tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
<tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">cutlass::reference::device::TensorUpdateDiagonal</a> (TensorView&lt; Element, Layout &gt; view, Element diag=Element(1))</td></tr> <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">cutlass::reference::device::TensorUpdateDiagonal</a> (TensorView&lt; Element, Layout &gt; view, Element diag=Element(1))</td></tr>

View File

@ -125,7 +125,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params.html">cutlass::reference::device::detail::RandomGaussianFunc::Params</a></div><div class="ttdoc">Parameters structure. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:99</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params.html">cutlass::reference::device::detail::RandomGaussianFunc::Params</a></div><div class="ttdoc">Parameters structure. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:99</div></div>
<div class="ttc" id="structcutlass_1_1Distribution_html_a07cb089b346ef06e198f6043128264fb"><div class="ttname"><a href="structcutlass_1_1Distribution.html#a07cb089b346ef06e198f6043128264fb">cutlass::Distribution::kind</a></div><div class="ttdeci">Kind kind</div><div class="ttdoc">Active variant kind. </div><div class="ttdef"><b>Definition:</b> distribution.h:64</div></div> <div class="ttc" id="structcutlass_1_1Distribution_html_a07cb089b346ef06e198f6043128264fb"><div class="ttname"><a href="structcutlass_1_1Distribution.html#a07cb089b346ef06e198f6043128264fb">cutlass::Distribution::kind</a></div><div class="ttdeci">Kind kind</div><div class="ttdoc">Active variant kind. </div><div class="ttdef"><b>Definition:</b> distribution.h:64</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params_html_a267e7ea4e77076cc9be7d639b3cef64d"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params.html#a267e7ea4e77076cc9be7d639b3cef64d">cutlass::reference::device::detail::TensorFillRandomUniformFunc::Params::Params</a></div><div class="ttdeci">Params(TensorView view_=TensorView(), typename RandomFunc::Params random_=RandomFunc::Params())</div><div class="ttdoc">Construction of Gaussian RNG functor. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:422</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params_html_a267e7ea4e77076cc9be7d639b3cef64d"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params.html#a267e7ea4e77076cc9be7d639b3cef64d">cutlass::reference::device::detail::TensorFillRandomUniformFunc::Params::Params</a></div><div class="ttdeci">Params(TensorView view_=TensorView(), typename RandomFunc::Params random_=RandomFunc::Params())</div><div class="ttdoc">Construction of Gaussian RNG functor. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:422</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1device_html_a6b0f21995c4fd5c33617550e6905c78e"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView&lt; Element, Layout &gt; view)</div><div class="ttdoc">Fills a tensor&amp;#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:630</div></div> <div class="ttc" id="namespacecutlass_1_1reference_1_1device_html_a6b0f21995c4fd5c33617550e6905c78e"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView&lt; Element, Layout &gt; view)</div><div class="ttdoc">Fills a tensor&amp;#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:630</div></div>
<div class="ttc" id="classcutlass_1_1TensorView_html_a7d3914dd5042c9c40be9e21a7b4e9ece"><div class="ttname"><a href="classcutlass_1_1TensorView.html#a7d3914dd5042c9c40be9e21a7b4e9ece">cutlass::TensorView::extent</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE TensorCoord const &amp; extent() const </div><div class="ttdoc">Returns the extent of the view (the size along each logical dimension). </div><div class="ttdef"><b>Definition:</b> tensor_view.h:167</div></div> <div class="ttc" id="classcutlass_1_1TensorView_html_a7d3914dd5042c9c40be9e21a7b4e9ece"><div class="ttname"><a href="classcutlass_1_1TensorView.html#a7d3914dd5042c9c40be9e21a7b4e9ece">cutlass::TensorView::extent</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE TensorCoord const &amp; extent() const </div><div class="ttdoc">Returns the extent of the view (the size along each logical dimension). </div><div class="ttdef"><b>Definition:</b> tensor_view.h:167</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc.html">cutlass::reference::device::detail::TensorUpdateDiagonalFunc</a></div><div class="ttdoc">Computes a random Gaussian distribution. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:645</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc.html">cutlass::reference::device::detail::TensorUpdateDiagonalFunc</a></div><div class="ttdoc">Computes a random Gaussian distribution. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:645</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params_html_afe8637b103e25ec2e9b731389fa049be"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params.html#afe8637b103e25ec2e9b731389fa049be">cutlass::reference::device::detail::RandomUniformFunc::Params::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:315</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params_html_afe8637b103e25ec2e9b731389fa049be"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params.html#afe8637b103e25ec2e9b731389fa049be">cutlass::reference::device::detail::RandomUniformFunc::Params::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:315</div></div>

File diff suppressed because one or more lines are too long

View File

@ -141,7 +141,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<li>Semaphore() <li>Semaphore()
: <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a> : <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a>
</li> </li>
<li>separate_string() <li>separate_string()
: <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a> : <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a>
</li> </li>
<li>set() <li>set()

View File

@ -172,7 +172,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<li>Semaphore() <li>Semaphore()
: <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a> : <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a>
</li> </li>
<li>separate_string() <li>separate_string()
: <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a> : <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a>
</li> </li>
<li>sequential <li>sequential

View File

@ -312,23 +312,23 @@ This inheritance list is sorted roughly, but not completely, alphabetically:</di
<tr id="row_197_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, InnerProductOp &gt;</a></td><td class="desc"></td></tr> <tr id="row_197_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, InnerProductOp &gt;</a></td><td class="desc"></td></tr>
<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr> <tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr>
<tr id="row_199_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr> <tr id="row_199_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td><td class="desc">Partial specialization for XOR-popc </td></tr> <tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
<tr id="row_201_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr> <tr id="row_201_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd &gt;</a></td><td class="desc">Partial specialization for multiply-add </td></tr>
<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr> <tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate &gt;</a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
<tr id="row_203_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td><td class="desc">Partial specialization for XOR-popc </td></tr> <tr id="row_203_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">cutlass::gemm::device::Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr> <tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">cutlass::gemm::device::Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_205_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">cutlass::gemm::device::Gemm&lt; ElementB, typename layout::LayoutTranspose&lt; LayoutB &gt;::type, ElementA, typename layout::LayoutTranspose&lt; LayoutA &gt;::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero &gt;</a></td><td class="desc"></td></tr> <tr id="row_205_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">cutlass::gemm::device::Gemm&lt; ElementB, typename layout::LayoutTranspose&lt; LayoutB &gt;::type, ElementA, typename layout::LayoutTranspose&lt; LayoutA &gt;::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero &gt;</a></td><td class="desc"></td></tr>
<tr id="row_206_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArguments.html" target="_self">cutlass::library::GemmArguments</a></td><td class="desc">Arguments for GEMM </td></tr> <tr id="row_206_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArguments.html" target="_self">cutlass::library::GemmArguments</a></td><td class="desc">Arguments for GEMM </td></tr>
<tr id="row_207_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayArguments.html" target="_self">cutlass::library::GemmArrayArguments</a></td><td class="desc">Arguments for GEMM - used by all the GEMM operations </td></tr> <tr id="row_207_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayArguments.html" target="_self">cutlass::library::GemmArrayArguments</a></td><td class="desc">Arguments for GEMM - used by all the GEMM operations </td></tr>
<tr id="row_208_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayConfiguration.html" target="_self">cutlass::library::GemmArrayConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr> <tr id="row_208_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayConfiguration.html" target="_self">cutlass::library::GemmArrayConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr>
<tr id="row_209_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td><td class="desc"></td></tr> <tr id="row_209_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td><td class="desc"></td></tr>
<tr id="row_210_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1kernel_1_1GemmBatched.html" target="_self">cutlass::gemm::kernel::GemmBatched&lt; Mma_, Epilogue_, ThreadblockSwizzle_ &gt;</a></td><td class="desc"></td></tr> <tr id="row_210_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1kernel_1_1GemmBatched.html" target="_self">cutlass::gemm::kernel::GemmBatched&lt; Mma_, Epilogue_, ThreadblockSwizzle_ &gt;</a></td><td class="desc"></td></tr>
<tr id="row_211_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr> <tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">cutlass::gemm::device::GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched&lt; ElementB, typename layout::LayoutTranspose&lt; LayoutB &gt;::type, ElementA, typename layout::LayoutTranspose&lt; LayoutA &gt;::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA &gt;</a></td><td class="desc"></td></tr> <tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched&lt; ElementB, typename layout::LayoutTranspose&lt; LayoutB &gt;::type, ElementA, typename layout::LayoutTranspose&lt; LayoutA &gt;::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA &gt;</a></td><td class="desc"></td></tr>
<tr id="row_213_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmBatchedConfiguration.html" target="_self">cutlass::library::GemmBatchedConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr> <tr id="row_213_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmBatchedConfiguration.html" target="_self">cutlass::library::GemmBatchedConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr>
<tr id="row_214_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmBatchedIdentityThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for batched GEMMs </td></tr> <tr id="row_214_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmBatchedIdentityThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for batched GEMMs </td></tr>
<tr id="row_215_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc"></td></tr> <tr id="row_215_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc"></td></tr>
<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr> <tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_217_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex&lt; ElementB, typename layout::LayoutTranspose&lt; LayoutB &gt;::type, ElementA, typename layout::LayoutTranspose&lt; LayoutA &gt;::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc"></td></tr> <tr id="row_217_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex&lt; ElementB, typename layout::LayoutTranspose&lt; LayoutB &gt;::type, ElementA, typename layout::LayoutTranspose&lt; LayoutA &gt;::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td><td class="desc"></td></tr>
<tr id="row_218_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmConfiguration.html" target="_self">cutlass::library::GemmConfiguration</a></td><td class="desc">Configuration for basic GEMM operations </td></tr> <tr id="row_218_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmConfiguration.html" target="_self">cutlass::library::GemmConfiguration</a></td><td class="desc">Configuration for basic GEMM operations </td></tr>
<tr id="row_219_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmHorizontalThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for GEMMs </td></tr> <tr id="row_219_"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmHorizontalThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for GEMMs </td></tr>

View File

@ -192,7 +192,7 @@ Functions</h2></td></tr>
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a> (TensorView&lt; Element, Layout &gt; dst)</td></tr> <tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a> (TensorView&lt; Element, Layout &gt; dst)</td></tr>
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft">&#160;</td><td class="mdescRight">Helper to fill a tensor's digonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr> <tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft">&#160;</td><td class="mdescRight">Helper to fill a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">cutlass::reference::host::TensorUpdateDiagonal</a> (TensorView&lt; Element, Layout &gt; dst, Element val=Element(1))</td></tr> <tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">cutlass::reference::host::TensorUpdateDiagonal</a> (TensorView&lt; Element, Layout &gt; dst, Element val=Element(1))</td></tr>

View File

@ -132,7 +132,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc_html_a4c9943f36faab7d4928b1f130d0b784c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc.html#a4c9943f36faab7d4928b1f130d0b784c">cutlass::reference::host::detail::RandomGaussianFunc::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:115</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc_html_a4c9943f36faab7d4928b1f130d0b784c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc.html#a4c9943f36faab7d4928b1f130d0b784c">cutlass::reference::host::detail::RandomGaussianFunc::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:115</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc.html">cutlass::reference::host::detail::TensorUpdateOffDiagonalFunc</a></div><div class="ttdoc">&lt; Layout function </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:597</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc.html">cutlass::reference::host::detail::TensorUpdateOffDiagonalFunc</a></div><div class="ttdoc">&lt; Layout function </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:597</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_ad0de7d4946af855288d7f9cccb9a18eb"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#ad0de7d4946af855288d7f9cccb9a18eb">cutlass::reference::host::detail::RandomUniformFunc&lt; complex&lt; Element &gt; &gt;::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:357</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_ad0de7d4946af855288d7f9cccb9a18eb"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#ad0de7d4946af855288d7f9cccb9a18eb">cutlass::reference::host::detail::RandomUniformFunc&lt; complex&lt; Element &gt; &gt;::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:357</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a29548cb522d9c147cf34263ecac75d89"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView&lt; Element, Layout &gt; dst)</div><div class="ttdoc">Helper to fill a tensor&amp;#39;s digonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:564</div></div> <div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a29548cb522d9c147cf34263ecac75d89"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView&lt; Element, Layout &gt; dst)</div><div class="ttdoc">Helper to fill a tensor&amp;#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:564</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_a6ef7020f1108432fe51853dffb7e727c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#a6ef7020f1108432fe51853dffb7e727c">cutlass::reference::host::detail::RandomUniformFunc&lt; complex&lt; Element &gt; &gt;::operator()</a></div><div class="ttdeci">complex&lt; Element &gt; operator()() const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:375</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_a6ef7020f1108432fe51853dffb7e727c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#a6ef7020f1108432fe51853dffb7e727c">cutlass::reference::host::detail::RandomUniformFunc&lt; complex&lt; Element &gt; &gt;::operator()</a></div><div class="ttdeci">complex&lt; Element &gt; operator()() const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:375</div></div>
<div class="ttc" id="namespacecutlass_html_a67f9e83dd59615eff837ea66984c121c"><div class="ttname"><a href="namespacecutlass.html#a67f9e83dd59615eff837ea66984c121c">cutlass::log</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE complex&lt; T &gt; log(complex&lt; T &gt; const &amp;z)</div><div class="ttdoc">Computes the complex exponential of z. </div><div class="ttdef"><b>Definition:</b> complex.h:381</div></div> <div class="ttc" id="namespacecutlass_html_a67f9e83dd59615eff837ea66984c121c"><div class="ttname"><a href="namespacecutlass.html#a67f9e83dd59615eff837ea66984c121c">cutlass::log</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE complex&lt; T &gt; log(complex&lt; T &gt; const &amp;z)</div><div class="ttdoc">Computes the complex exponential of z. </div><div class="ttdef"><b>Definition:</b> complex.h:381</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc_html_a4e447a80bd94cde69fa66f9e9d882b28"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc.html#a4e447a80bd94cde69fa66f9e9d882b28">cutlass::reference::host::detail::TensorFillGaussianFunc::operator()</a></div><div class="ttdeci">void operator()(Coord&lt; Layout::kRank &gt; const &amp;coord) const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:236</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc_html_a4e447a80bd94cde69fa66f9e9d882b28"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc.html#a4e447a80bd94cde69fa66f9e9d882b28">cutlass::reference::host::detail::TensorFillGaussianFunc::operator()</a></div><div class="ttdeci">void operator()(Coord&lt; Layout::kRank &gt; const &amp;coord) const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:236</div></div>

View File

@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_a1161a761c596e714982fe30141211cca"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#a1161a761c596e714982fe30141211cca">cutlass::reference::host::detail::TensorForEachHelper::kActiveRank</a></div><div class="ttdeci">static int const kActiveRank</div><div class="ttdoc">Index of the active rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:44</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_a1161a761c596e714982fe30141211cca"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#a1161a761c596e714982fe30141211cca">cutlass::reference::host::detail::TensorForEachHelper::kActiveRank</a></div><div class="ttdeci">static int const kActiveRank</div><div class="ttdoc">Index of the active rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:44</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_aa63906bbecfe42eec1991c9176f066d9"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#aa63906bbecfe42eec1991c9176f066d9">cutlass::reference::host::detail::TensorForEachHelper::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &amp;func, Coord&lt; Rank &gt; const &amp;extent, Coord&lt; Rank &gt; &amp;coord)</div><div class="ttdoc">Constructor for general rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:47</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_aa63906bbecfe42eec1991c9176f066d9"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#aa63906bbecfe42eec1991c9176f066d9">cutlass::reference::host::detail::TensorForEachHelper::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &amp;func, Coord&lt; Rank &gt; const &amp;extent, Coord&lt; Rank &gt; &amp;coord)</div><div class="ttdoc">Constructor for general rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:47</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html">cutlass::reference::host::detail::TensorForEachHelper</a></div><div class="ttdoc">Helper to perform for-each operation. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:41</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html">cutlass::reference::host::detail::TensorForEachHelper</a></div><div class="ttdoc">Helper to perform for-each operation. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:41</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4_html_a5029a4405a9a5e64011addb43bb88120"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">cutlass::reference::host::detail::TensorForEachHelper&lt; Func, Rank, 0 &gt;::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &amp;func, Coord&lt; Rank &gt; const &amp;extent, Coord&lt; Rank &gt; &amp;coord)</div><div class="ttdoc">Constructor for fastest chaning rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:67</div></div> <div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4_html_a5029a4405a9a5e64011addb43bb88120"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">cutlass::reference::host::detail::TensorForEachHelper&lt; Func, Rank, 0 &gt;::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &amp;func, Coord&lt; Rank &gt; const &amp;extent, Coord&lt; Rank &gt; &amp;coord)</div><div class="ttdoc">Constructor for fastest changing rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:67</div></div>
<div class="ttc" id="structcutlass_1_1Coord_html"><div class="ttname"><a href="structcutlass_1_1Coord.html">cutlass::Coord</a></div><div class="ttdoc">Statically-sized array specifying Coords within a tensor. </div><div class="ttdef"><b>Definition:</b> coord.h:43</div></div> <div class="ttc" id="structcutlass_1_1Coord_html"><div class="ttname"><a href="structcutlass_1_1Coord.html">cutlass::Coord</a></div><div class="ttdoc">Statically-sized array specifying Coords within a tensor. </div><div class="ttdef"><b>Definition:</b> coord.h:43</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a3825b1aaaf5e5abf0de5f427e3481ada"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a3825b1aaaf5e5abf0de5f427e3481ada">cutlass::reference::host::TensorForEachLambda</a></div><div class="ttdeci">void TensorForEachLambda(Coord&lt; Rank &gt; extent, Func func)</div><div class="ttdoc">Iterates over the index space of a tensor and calls a C++ lambda. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:98</div></div> <div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a3825b1aaaf5e5abf0de5f427e3481ada"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a3825b1aaaf5e5abf0de5f427e3481ada">cutlass::reference::host::TensorForEachLambda</a></div><div class="ttdeci">void TensorForEachLambda(Coord&lt; Rank &gt; extent, Func func)</div><div class="ttdoc">Iterates over the index space of a tensor and calls a C++ lambda. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:98</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a8c798c04df572b34e3ed3976d69f993d"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a8c798c04df572b34e3ed3976d69f993d">cutlass::reference::host::TensorForEach</a></div><div class="ttdeci">void TensorForEach(Coord&lt; Rank &gt; extent, Func &amp;func)</div><div class="ttdoc">Iterates over the index space of a tensor. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:87</div></div> <div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a8c798c04df572b34e3ed3976d69f993d"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a8c798c04df572b34e3ed3976d69f993d">cutlass::reference::host::TensorForEach</a></div><div class="ttdeci">void TensorForEach(Coord&lt; Rank &gt; extent, Func &amp;func)</div><div class="ttdoc">Iterates over the index space of a tensor. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:87</div></div>

View File

@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">cutlass::gemm::device::Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">cutlass::gemm::device::Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html">cutlass::gemm::device::Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;::Arguments</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html">cutlass::gemm::device::Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html#details">More...</a><br /></td></tr>

View File

@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;::Arguments</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html">cutlass::gemm::device::GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html#details">More...</a><br /></td></tr>

File diff suppressed because one or more lines are too long

View File

@ -134,17 +134,17 @@ Classes</h2></td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html">Gemm</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html">Gemm</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">Gemm&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html">GemmBatched</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html">GemmBatched</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">GemmBatched&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html">GemmComplex</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html">GemmComplex</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">GemmComplex&lt; ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html">GemmSplitKParallel</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html">GemmSplitKParallel</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>

View File

@ -125,7 +125,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html">TensorDiagonalForEach</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html">TensorDiagonalForEach</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Launches a kernel calling a functor for each element along a tensor's diagonal. <a href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Launches a kernel calling a functor for each element along a tensor's diagonal. <a href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html#details">More...</a><br /></td></tr>
@ -183,7 +183,7 @@ Functions</h2></td></tr>
<tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; view)</td></tr> <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; view)</td></tr>
<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Fills a tensor's digonal with 1 and 0 everywhere else. <a href="#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr> <tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
<tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; view, Element diag=Element(1))</td></tr> <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; view, Element diag=Element(1))</td></tr>

View File

@ -122,7 +122,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls"> </table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
@ -247,7 +247,7 @@ Functions</h2></td></tr>
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; dst)</td></tr> <tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; dst)</td></tr>
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft">&#160;</td><td class="mdescRight">Helper to fill a tensor's digonal with 1 and 0 everywhere else. <a href="#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr> <tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft">&#160;</td><td class="mdescRight">Helper to fill a tensor's diagonal with 1 and 0 everywhere else. <a href="#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr> <tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template&lt;typename Element , typename Layout &gt; </td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; dst, Element val=Element(1))</td></tr> <tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>&lt; Element, Layout &gt; dst, Element val=Element(1))</td></tr>

View File

@ -14,7 +14,7 @@ var searchData=
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]],
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]],
['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]], ['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]],
['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], ['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]], ['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]],
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]], ['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]], ['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],

View File

@ -3,7 +3,7 @@ var searchData=
['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum&lt; Array&lt; T, N &gt; &gt;']]], ['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum&lt; Array&lt; T, N &gt; &gt;']]],
['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]], ['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]],
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]],
['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], ['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]], ['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]], ['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]], ['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]],

View File

@ -115,7 +115,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1603f1c65c6d8d3d4262443b40e5c290">keys</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr> <tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1603f1c65c6d8d3d4262443b40e5c290">keys</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a0bee40a3cc6078a08eec5d4ca4711f61">num_naked_args</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr> <tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a0bee40a3cc6078a08eec5d4ca4711f61">num_naked_args</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a228e1a273d223eec4b2f6d73135d3c1e">parsed_argc</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr> <tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a228e1a273d223eec4b2f6d73135d3c1e">parsed_argc</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">seperate_string</a>(std::string const &amp;str, std::vector&lt; value_t &gt; &amp;vals, char sep= ',')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr> <tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">separate_string</a>(std::string const &amp;str, std::vector&lt; value_t &gt; &amp;vals, char sep= ',')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1944da52162e04b12a82ce0c1ade676e">tokenize</a>(std::vector&lt; std::pair&lt; std::string, std::string &gt; &gt; &amp;tokens, std::string const &amp;str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr> <tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1944da52162e04b12a82ce0c1ade676e">tokenize</a>(std::vector&lt; std::pair&lt; std::string, std::string &gt; &gt; &amp;tokens, std::string const &amp;str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a440c25cfb006f218ff4705a43320a28b">tokenize</a>(std::vector&lt; std::string &gt; &amp;tokens, std::string const &amp;str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr> <tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a440c25cfb006f218ff4705a43320a28b">tokenize</a>(std::vector&lt; std::string &gt; &amp;tokens, std::string const &amp;str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#ade127841e9730589f611b618e9440012">values</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr> <tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#ade127841e9730589f611b618e9440012">values</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>

View File

@ -151,7 +151,7 @@ Static Public Member Functions</h2></td></tr>
<tr class="memdesc:a440c25cfb006f218ff4705a43320a28b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Tokenizes a comma-delimited list of string pairs delimited by ':'. <a href="#a440c25cfb006f218ff4705a43320a28b">More...</a><br /></td></tr> <tr class="memdesc:a440c25cfb006f218ff4705a43320a28b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Tokenizes a comma-delimited list of string pairs delimited by ':'. <a href="#a440c25cfb006f218ff4705a43320a28b">More...</a><br /></td></tr>
<tr class="separator:a440c25cfb006f218ff4705a43320a28b"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a440c25cfb006f218ff4705a43320a28b"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplParams" colspan="2">template&lt;typename value_t &gt; </td></tr> <tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplParams" colspan="2">template&lt;typename value_t &gt; </td></tr>
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplItemLeft" align="right" valign="top">static void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">seperate_string</a> (std::string const &amp;str, std::vector&lt; value_t &gt; &amp;vals, char sep= ',')</td></tr> <tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplItemLeft" align="right" valign="top">static void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">separate_string</a> (std::string const &amp;str, std::vector&lt; value_t &gt; &amp;vals, char sep= ',')</td></tr>
<tr class="separator:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls"> </table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-attribs"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-attribs"></a>
@ -548,7 +548,7 @@ template&lt;typename value_t &gt; </div>
<td class="mlabels-left"> <td class="mlabels-left">
<table class="memname"> <table class="memname">
<tr> <tr>
<td class="memname">static void cutlass::CommandLine::seperate_string </td> <td class="memname">static void cutlass::CommandLine::separate_string </td>
<td>(</td> <td>(</td>
<td class="paramtype">std::string const &amp;&#160;</td> <td class="paramtype">std::string const &amp;&#160;</td>
<td class="paramname"><em>str</em>, </td> <td class="paramname"><em>str</em>, </td>

View File

@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
</div><!--header--> </div><!--header-->
<div class="contents"> <div class="contents">
<p>Parital specialization for XOR-popc. <p>Partial specialization for XOR-popc.
</p> </p>
<p><code>#include &lt;<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h_source.html">gemm.h</a>&gt;</code></p> <p><code>#include &lt;<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h_source.html">gemm.h</a>&gt;</code></p>

View File

@ -112,7 +112,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
Public Member Functions</h2></td></tr> Public Member Functions</h2></td></tr>
<tr class="memitem:a89e10e059c3ffcfe2640cf6291353937"><td class="memItemLeft" align="right" valign="top">__inline__ __device__&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a89e10e059c3ffcfe2640cf6291353937">TensorForEachHelper</a> (Func &amp;func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; const &amp;size, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; &amp;coord, int64_t index)</td></tr> <tr class="memitem:a89e10e059c3ffcfe2640cf6291353937"><td class="memItemLeft" align="right" valign="top">__inline__ __device__&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a89e10e059c3ffcfe2640cf6291353937">TensorForEachHelper</a> (Func &amp;func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; const &amp;size, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; &amp;coord, int64_t index)</td></tr>
<tr class="memdesc:a89e10e059c3ffcfe2640cf6291353937"><td class="mdescLeft">&#160;</td><td class="mdescRight">Constructor for fastest chaning rank. <a href="#a89e10e059c3ffcfe2640cf6291353937">More...</a><br /></td></tr> <tr class="memdesc:a89e10e059c3ffcfe2640cf6291353937"><td class="mdescLeft">&#160;</td><td class="mdescRight">Constructor for fastest changing rank. <a href="#a89e10e059c3ffcfe2640cf6291353937">More...</a><br /></td></tr>
<tr class="separator:a89e10e059c3ffcfe2640cf6291353937"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a89e10e059c3ffcfe2640cf6291353937"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table> </table>
<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2> <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>

View File

@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
</div><!--header--> </div><!--header-->
<div class="contents"> <div class="contents">
<p>Parital specialization for XOR-popc. <p>Partial specialization for XOR-popc.
</p> </p>
<p><code>#include &lt;<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h_source.html">gemm.h</a>&gt;</code></p> <p><code>#include &lt;<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h_source.html">gemm.h</a>&gt;</code></p>

View File

@ -113,7 +113,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
Public Member Functions</h2></td></tr> Public Member Functions</h2></td></tr>
<tr class="memitem:a5029a4405a9a5e64011addb43bb88120"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">TensorForEachHelper</a> (Func &amp;func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; const &amp;extent, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; &amp;coord)</td></tr> <tr class="memitem:a5029a4405a9a5e64011addb43bb88120"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">TensorForEachHelper</a> (Func &amp;func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; const &amp;extent, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>&lt; Rank &gt; &amp;coord)</td></tr>
<tr class="memdesc:a5029a4405a9a5e64011addb43bb88120"><td class="mdescLeft">&#160;</td><td class="mdescRight">Constructor for fastest chaning rank. <a href="#a5029a4405a9a5e64011addb43bb88120">More...</a><br /></td></tr> <tr class="memdesc:a5029a4405a9a5e64011addb43bb88120"><td class="mdescLeft">&#160;</td><td class="mdescRight">Constructor for fastest changing rank. <a href="#a5029a4405a9a5e64011addb43bb88120">More...</a><br /></td></tr>
<tr class="separator:a5029a4405a9a5e64011addb43bb88120"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:a5029a4405a9a5e64011addb43bb88120"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls"> </table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-attribs"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-attribs"></a>

View File

@ -134,7 +134,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">cutlass::reference::device::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls"> </table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>

View File

@ -141,7 +141,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td></tr> <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">cutlass::reference::host::Gemm&lt; ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc &gt;</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr> <tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr> <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls"> </table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a> <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>

File diff suppressed because one or more lines are too long

View File

@ -47,7 +47,7 @@
or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
prevalent in the CUTLASS unit tests. prevalent in the CUTLASS unit tests.
This example has deliberately been kept similar to the basic_gemm example from cutass-1.3 to This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to
highlight the minimum amount of differences needed to transition to cutlass-2.0. highlight the minimum amount of differences needed to transition to cutlass-2.0.
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu

View File

@ -75,7 +75,7 @@ Now that we setup the properties of data, we have to setup properties of computa
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32, Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally 64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduces the amount of threads needed per thread-block, amount of shared memory, storing data in deduces the amount of threads needed per thread-block, amount of shared memory, storing data in
bank-conflict free manner, and a ton of other variables required to compose, intialize and launch a bank-conflict free manner, and a ton of other variables required to compose, initialize and launch a
high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from
understanding and coding complicated hardware optimizations which can easily go wrong. understanding and coding complicated hardware optimizations which can easily go wrong.
@ -107,7 +107,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
These are all put together to create a template variable which describes CUTLASS GEMM kernel using These are all put together to create a template variable which describes CUTLASS GEMM kernel using
cutlass::gemm::device::Gemm template. cutlass::gemm::device::Gemm template.
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it.
We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and don't come We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and don't come
in the way of learning CUTLASS. in the way of learning CUTLASS.
@ -115,7 +115,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
memory required by the kernel we instantiated. If yes, we create it and pass it along with other memory required by the kernel we instantiated. If yes, we create it and pass it along with other
arguments created to intialize CUTLASS kernel then, the kernel is launched. arguments created to initialize CUTLASS kernel then, the kernel is launched.
In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if
the output from CUTLASS kernel is same as reference GEMM kernel. the output from CUTLASS kernel is same as reference GEMM kernel.

View File

@ -74,7 +74,7 @@ Now that we setup the properties of data, we have to setup properties of computa
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64, Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64,
64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally 64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduces the amount of threads needed per thread-block, amount of shared memory, storing data in deduces the amount of threads needed per thread-block, amount of shared memory, storing data in
bank-conflict free manner, and a ton of other variables required to compose, intialize and launch a bank-conflict free manner, and a ton of other variables required to compose, initialize and launch a
high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from
understanding and coding complicated hardware optimizations which can easily go wrong. understanding and coding complicated hardware optimizations which can easily go wrong.
@ -106,7 +106,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
These are all put together to create a template variable which describes CUTLASS GEMM kernel using These are all put together to create a template variable which describes CUTLASS GEMM kernel using
cutlass::gemm::device::Gemm template. cutlass::gemm::device::Gemm template.
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it.
We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and don't come We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and don't come
in the way of learning CUTLASS. in the way of learning CUTLASS.
@ -114,7 +114,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
memory required by the kernel we instantiated. If yes, we create it and pass it along with other memory required by the kernel we instantiated. If yes, we create it and pass it along with other
arguments created to intialize CUTLASS kernel then, the kernel is launched. arguments created to initialize CUTLASS kernel then, the kernel is launched.
In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if
the output from CUTLASS kernel is same as reference GEMM kernel. the output from CUTLASS kernel is same as reference GEMM kernel.

View File

@ -76,7 +76,7 @@ Now that we setup the properties of data, we have to setup properties of computa
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128, Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128,
64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it 64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it
internally deduces the amount of threads needed per thread-block, amount of shared memory, storing internally deduces the amount of threads needed per thread-block, amount of shared memory, storing
data in bank-conflict free manner, and a ton of other variables required to compose, intialize and data in bank-conflict free manner, and a ton of other variables required to compose, initialize and
launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer
from understanding and coding complicated hardware optimizations which can easily go wrong. from understanding and coding complicated hardware optimizations which can easily go wrong.
@ -108,7 +108,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
These are all put together to create a template variable which describes CUTLASS Implicit GEMM These are all put together to create a template variable which describes CUTLASS Implicit GEMM
kernel using cutlass::conv::device::ImplicitGemm template. kernel using cutlass::conv::device::ImplicitGemm template.
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it.
We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and don't come We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and don't come
in the way of learning CUTLASS. in the way of learning CUTLASS.
@ -117,7 +117,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
memory required by the kernel we instantiated. If yes, we create it and pass it along with other memory required by the kernel we instantiated. If yes, we create it and pass it along with other
arguments created to intialize CUTLASS kernel then, the kernel is launched. arguments created to initialize CUTLASS kernel then, the kernel is launched.
In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to
compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel.

View File

@ -321,7 +321,7 @@ public:
int smem_write_stage_idx = 1; int smem_write_stage_idx = 1;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop
@ -461,7 +461,7 @@ public:
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -341,7 +341,7 @@ public:
int smem_write_stage_idx = 1; int smem_write_stage_idx = 1;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -325,7 +325,7 @@ public:
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1); iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -346,7 +346,7 @@ public:
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1); iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -73,7 +73,7 @@ Now that we setup the properties of data, we have to setup properties of computa
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64, Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64,
64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it 64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it
internally deduces the amount of threads needed per thread-block, amount of shared memory, storing internally deduces the amount of threads needed per thread-block, amount of shared memory, storing
data in bank-conflict free manner, and a ton of other variables required to compose, intialize and data in bank-conflict free manner, and a ton of other variables required to compose, initialize and
launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer
from understanding and coding complicated hardware optimizations which can easily go wrong. from understanding and coding complicated hardware optimizations which can easily go wrong.
@ -95,7 +95,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
These are all put together to create a template variable which describes CUTLASS Implicit GEMM These are all put together to create a template variable which describes CUTLASS Implicit GEMM
kernel using cutlass::conv::device::ImplicitGemm template. kernel using cutlass::conv::device::ImplicitGemm template.
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it.
We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come
in the way of learning CUTLASS. in the way of learning CUTLASS.
@ -104,7 +104,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
memory required by the kernel we instantiated. If yes, we create it and pass it along with other memory required by the kernel we instantiated. If yes, we create it and pass it along with other
arguments created to intialize CUTLASS kernel then, the kernel is launched. arguments created to initialize CUTLASS kernel then, the kernel is launched.
In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to
compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel.

View File

@ -36,7 +36,7 @@ computing GEMM. So the output also contains either a Mx1 or 1XN vector. It onl
core instructions. core instructions.
Most of the reduction is done in gemm/warp level, see gemm/warp/mma_with_reduction_tensor_op.h Most of the reduction is done in gemm/warp level, see gemm/warp/mma_with_reduction_tensor_op.h
A bit of reduction is done in the epilouge before storing the vector, see A bit of reduction is done in the epilogue before storing the vector, see
epilogue/threadblock/epilogue_gemm_k_reduction.h epilogue/threadblock/epilogue_gemm_k_reduction.h
*/ */

View File

@ -1088,7 +1088,7 @@ int main(int argc, char const **args) {
// Determine kernel configuration based on head size. // Determine kernel configuration based on head size.
// If head size is less than or equal to 64, each block operates over 64 queries and // If head size is less than or equal to 64, each block operates over 64 queries and
// 64 keys, and parital results can be stored in the register file. // 64 keys, and partial results can be stored in the register file.
// If head size is greater than 64, each block operates over 32 queries and 128 keys, // If head size is greater than 64, each block operates over 32 queries and 128 keys,
// and partial results are stored in shared memory. // and partial results are stored in shared memory.
if (options.head_size_v > 64) { if (options.head_size_v > 64) {

View File

@ -1173,7 +1173,7 @@ int main(int argc, char const **args) {
// Determine kernel configuration based on head size. // Determine kernel configuration based on head size.
// If head size is less than or equal to 64, each block operates over 64 queries and // If head size is less than or equal to 64, each block operates over 64 queries and
// 64 keys, and parital results can be stored in the register file. // 64 keys, and partial results can be stored in the register file.
// If head size is greater than 64, each block operates over 32 queries and 128 keys, // If head size is greater than 64, each block operates over 32 queries and 128 keys,
// and partial results are stored in shared memory. // and partial results are stored in shared memory.
if (options.head_size_v > 64) { if (options.head_size_v > 64) {

View File

@ -310,7 +310,7 @@ class CustomMmaPipelined : public CustomMmaBase<Shape_, Policy_, 2> {
iterator_B.clear_mask(gemm_k_iterations <= 1); iterator_B.clear_mask(gemm_k_iterations <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* // Issue loads during the first warp-level matrix multiply-add *AFTER*
// issuing shared memory loads (which have the tighest latency requirement). // issuing shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -600,7 +600,7 @@ class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory<
iterator_B.clear_mask(gemm_k_iterations <= 1); iterator_B.clear_mask(gemm_k_iterations <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* // Issue loads during the first warp-level matrix multiply-add *AFTER*
// issuing shared memory loads (which have the tighest latency requirement). // issuing shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -181,7 +181,7 @@ class PredicatedTileAccessIteratorResidualLast<
BytePointer pointer_; BytePointer pointer_;
/// Below is used when Gather is turned on. We need to record strided_offset /// Below is used when Gather is turned on. We need to record strided_offset
/// and contiguous_offset seperated to compute the offset by using /// and contiguous_offset separated to compute the offset by using
/// ///
/// offset = contiguous_offset + indices[strided_offset] /// offset = contiguous_offset + indices[strided_offset]
/// ///

View File

@ -86,14 +86,14 @@ class gen_default_b2b_mma:
"OperatorClass", str(stage), "Operator") "OperatorClass", str(stage), "Operator")
return gen_code return gen_code
def gen_using_FusedAddBiasEpilouge(self): def gen_using_FusedAddBiasEpilogue(self):
gen_code = "" gen_code = ""
for i in range(self.b2b_num - 1): for i in range(self.b2b_num - 1):
code_using = helper.var_idx("using FusedAddBiasEpilouge", i) code_using = helper.var_idx("using FusedAddBiasEpilogue", i)
epilouge_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp" epilogue_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp"
template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue" template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue"
gen_code += code_using + " = " + epilouge_name + template_args + ";\n" gen_code += code_using + " = " + epilogue_name + template_args + ";\n"
return gen_code return gen_code
@ -161,12 +161,12 @@ class gen_default_b2b_mma:
MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, " MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, "
for i in range(self.b2b_num - 1): for i in range(self.b2b_num - 1):
epilouge_name = "EpilogueOutputOp" + str(i) epilogue_name = "EpilogueOutputOp" + str(i)
MmaPipelined_param_list += epilouge_name + ", " MmaPipelined_param_list += epilogue_name + ", "
for i in range(self.b2b_num - 1): for i in range(self.b2b_num - 1):
epilouge_name = "FusedAddBiasEpilouge" + str(i) epilogue_name = "FusedAddBiasEpilogue" + str(i)
MmaPipelined_param_list += epilouge_name + ", " MmaPipelined_param_list += epilogue_name + ", "
for i in range(self.b2b_num): for i in range(self.b2b_num):
MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy" MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy"
@ -198,7 +198,7 @@ class gen_default_b2b_mma:
mmacore_codebody = self.gen_using_MmaCore(2) mmacore_codebody = self.gen_using_MmaCore(2)
iterator_codebody = self.gen_using_Iterator() iterator_codebody = self.gen_using_Iterator()
fragment_iterator_codebody = self.gen_fragment_iterator() fragment_iterator_codebody = self.gen_fragment_iterator()
epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilouge() epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilogue()
threadBlockMma = self.gen_threadblockmma() threadBlockMma = self.gen_threadblockmma()
specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma
@ -352,7 +352,7 @@ class gen_b2b_mme_pipelined:
}\n\ }\n\
\n\ \n\
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
// shared memory loads (which have the tighest latency requirement).\n\ // shared memory loads (which have the tightest latency requirement).\n\
\n\ \n\
//\n\ //\n\
// Mainloop\n\ // Mainloop\n\
@ -459,7 +459,7 @@ class gen_b2b_mme_pipelined:
}\n\ }\n\
\n\ \n\
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
// shared memory loads (which have the tighest latency requirement).\n\ // shared memory loads (which have the tightest latency requirement).\n\
iterator_A.load(tb_frag_A);\n\ iterator_A.load(tb_frag_A);\n\
\n\ \n\
//\n\ //\n\
@ -490,7 +490,7 @@ class gen_b2b_mme_pipelined:
__syncthreads();\n\ __syncthreads();\n\
\n\ \n\
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
// shared memory loads (which have the tighest latency requirement).\n\ // shared memory loads (which have the tightest latency requirement).\n\
iterator_A.load(tb_frag_A);\n\ iterator_A.load(tb_frag_A);\n\
\n\ \n\
++this->smem_iterator_B0_;\n\ ++this->smem_iterator_B0_;\n\
@ -549,12 +549,12 @@ class gen_b2b_mme_pipelined:
code = "// " + str(id + 1) + " Gemm" code = "// " + str(id + 1) + " Gemm"
code += " /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n" code += " /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n"
code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilouge_accu", id - 1) + ";\n" code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilogue_accu", id - 1) + ";\n"
code += " " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \ code += " " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \
+ helper.var_idx(", after_epilouge_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n" + helper.var_idx(", after_epilogue_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n"
# FragmentIteratorA1 warp_tile_iterator_A1_(accum0); # FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilouge_accu", id - 1) + ");\n" code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilogue_accu", id - 1) + ");\n"
# FragmentB1 tb_frag_B1; # FragmentB1 tb_frag_B1;
code += " " + helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n" code += " " + helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n"
# tb_frag_B1.clear(); # tb_frag_B1.clear();
@ -990,7 +990,7 @@ class gen_threadblock:
self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
self.gen_b2b_mma_piplined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) self.gen_b2b_mma_pipelined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
@ -1001,7 +1001,7 @@ class gen_threadblock:
with open(self.file_dir + "b2b_mma_base.h", "w+") as f: with open(self.file_dir + "b2b_mma_base.h", "w+") as f:
f.write(base_code) f.write(base_code)
pipeline_code = self.gen_b2b_mma_piplined.gen_code(first_use_1stage = first_use_1stage) pipeline_code = self.gen_b2b_mma_pipelined.gen_code(first_use_1stage = first_use_1stage)
print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir) print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir)
with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f: with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f:

View File

@ -45,7 +45,7 @@ class gen_verify:
self.user_header_file = "" self.user_header_file = ""
for header in user_header_file: for header in user_header_file:
self.user_header_file += "#include \"" + header + "\"\n" self.user_header_file += "#include \"" + header + "\"\n"
self.seperate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) self.separate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
self.gen_params() self.gen_params()
self.output_dir = output_dir self.output_dir = output_dir
@ -53,14 +53,14 @@ class gen_verify:
def gen_code(self): def gen_code(self):
code = "" code = ""
code += self.user_header_file code += self.user_header_file
code += self.seperate_cutlass.gen_using(False) #False -> Turing, True -> Volta code += self.separate_cutlass.gen_using(False) #False -> Turing, True -> Volta
code_body = "" code_body = ""
for i in range(self.b2b_num): for i in range(self.b2b_num):
code_body += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n" code_body += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n"
code_body += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n" code_body += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n"
code_body += self.seperate_cutlass.gen_run() code_body += self.separate_cutlass.gen_run()
code += ir.gen_func(self.name, self.params, code_body) code += ir.gen_func(self.name, self.params, code_body)
helper.write_2_headfile("cutlass_verify.h", self.output_dir, code) helper.write_2_headfile("cutlass_verify.h", self.output_dir, code)
@ -87,6 +87,6 @@ class gen_verify:
def gen_initialize(): def gen_initialize():
code = "" code = ""
initialize_code = self.seperate_cutlass.gen_initialize() initialize_code = self.separate_cutlass.gen_initialize()
code = ir.gen_func("initialize", [[]]) code = ir.gen_func("initialize", [[]])

View File

@ -83,23 +83,23 @@ def list_2_string(input_list, ):
return rtn_string return rtn_string
def get_epilouge_info(layer_info): def get_epilogue_info(layer_info):
return layer_info['epilogue'] return layer_info['epilogue']
def get_epilogue_tp(layer_info): def get_epilogue_tp(layer_info):
epilogue_info = get_epilouge_info(layer_info) epilogue_info = get_epilogue_info(layer_info)
return epilogue_info['tp'] return epilogue_info['tp']
def get_epilogue_add_bias_or_not(layer_info): def get_epilogue_add_bias_or_not(layer_info):
epilogue_info = get_epilouge_info(layer_info) epilogue_info = get_epilogue_info(layer_info)
return epilogue_info['bias']['addbias'] return epilogue_info['bias']['addbias']
def get_epilogue_add_bias_tp(layer_info): def get_epilogue_add_bias_tp(layer_info):
epilogue_info = get_epilouge_info(layer_info) epilogue_info = get_epilogue_info(layer_info)
return epilogue_info['bias']['bias_tp'] return epilogue_info['bias']['bias_tp']
def get_epilogue_args(layer_info): def get_epilogue_args(layer_info):
epilogue_info = get_epilouge_info(layer_info) epilogue_info = get_epilogue_info(layer_info)
return epilogue_info['args'] return epilogue_info['args']
def get_epilogue_bias_shape(layer_info): def get_epilogue_bias_shape(layer_info):

View File

@ -33,7 +33,7 @@
\brief Hopper GEMM example leveraging collective operation builders. \brief Hopper GEMM example leveraging collective operation builders.
This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels
targetting the NVIDIA Hopper architecture. targeting the NVIDIA Hopper architecture.
Background and motivation Background and motivation
------------------------- -------------------------
@ -45,7 +45,7 @@
However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed
in CUTLASS 3: in CUTLASS 3:
(1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without (1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without
specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targetting specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targeting
Ampere specify that three pipeline stages should be used regardless of the sizes of operands. Ampere specify that three pipeline stages should be used regardless of the sizes of operands.
If one wished to increase this value, one would also need to specify all other template parameters. If one wished to increase this value, one would also need to specify all other template parameters.
This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface. This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface.
@ -55,7 +55,7 @@
Alongside these opportunities for improvement, the Hopper architecture offers new features that increase Alongside these opportunities for improvement, the Hopper architecture offers new features that increase
the number of valid configurations of a kernel. In addition to the many template parameters already available the number of valid configurations of a kernel. In addition to the many template parameters already available
in CUTLASS 2 kernels, CUTLASS 3 kernels targetting Hopper also have various scheduling modes to select from that control: in CUTLASS 2 kernels, CUTLASS 3 kernels targeting Hopper also have various scheduling modes to select from that control:
(1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async) (1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async)
(2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization") (2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization")
(3) whether persistent thread blocks should be used (3) whether persistent thread blocks should be used
@ -64,13 +64,13 @@
Introduction to the CollectiveBuilder Introduction to the CollectiveBuilder
------------------------------------- -------------------------------------
CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters
for kernels targetting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder for kernels targeting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder
takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically
determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided
parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template
parameters), the CollectiveBuilder will also automatically select these. parameters), the CollectiveBuilder will also automatically select these.
Unlike DefaultGemmConfigurations a parital specialization of the CollectiveBuilder is not needed for many Unlike DefaultGemmConfigurations a partial specialization of the CollectiveBuilder is not needed for many
configurations of operand types. Instead the CollectiveBuilder "builds" a configuration based on generic configurations of operand types. Instead the CollectiveBuilder "builds" a configuration based on generic
properties of the specified operands, layouts, and other parameters. For example, when the stage count properties of the specified operands, layouts, and other parameters. For example, when the stage count
is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that
@ -90,7 +90,7 @@
Details of this example Details of this example
----------------------- -----------------------
This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified. This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.
This example also illustrates how CUTLASS 3 GEMMs targetting Hopper automatically support batched GEMMs by simply This example also illustrates how CUTLASS 3 GEMMs targeting Hopper automatically support batched GEMMs by simply
extending the problem size with an additional tensor rank. extending the problem size with an additional tensor rank.
Example usage: Example usage:
@ -162,7 +162,7 @@ struct Options {
out << "49_hopper_gemm_schedules_with_collective_builder\n\n" out << "49_hopper_gemm_schedules_with_collective_builder\n\n"
<< " This example showcases the use of CUTLASS's collective operation builders to easily construct\n" << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n"
<< " performant kernels targetting NVIDIA's Hopper architecture.\n\n" << " performant kernels targeting NVIDIA's Hopper architecture.\n\n"
<< "Options:\n\n" << "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n" << " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n" << " --m=<int> Sets the M extent of the GEMM\n"

View File

@ -718,7 +718,7 @@ make_tma_copy(CopyOp,
<< "\nswizzle " << smem_swizzle << "\nswizzle " << smem_swizzle
<< "\nl2Promotion " << tma_l2Promotion << "\nl2Promotion " << tma_l2Promotion
<< "\noobFill " << tma_oobFill << std::endl; << "\noobFill " << tma_oobFill << std::endl;
std::cerr << "Error: Failed to intialize the TMA descriptor " << result << std::endl; std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl;
assert(false); assert(false);
} }
#endif // (__CUDACC_VER_MAJOR__ >= 12) #endif // (__CUDACC_VER_MAJOR__ >= 12)

View File

@ -98,11 +98,11 @@ struct OpClassSimt {};
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
/// Tag classifing operators as Tensor Core operations. /// Tag classifying operators as Tensor Core operations.
struct OpClassTensorOp {}; struct OpClassTensorOp {};
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
/// Tag classifing operators as WMMA Tensor Core operations /// Tag classifying operators as WMMA Tensor Core operations
struct OpClassWmmaTensorOp {}; struct OpClassWmmaTensorOp {};
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -230,7 +230,7 @@ public:
offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h; offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w; offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;
// Intialize pointers for gemm_k=0 // Initialize pointers for gemm_k=0
TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_}; TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_};
pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8; pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
@ -341,7 +341,7 @@ public:
next_idx = 1; next_idx = 1;
// Restore bytes in q coordinate (Mma in filter s dimenstion) // Restore bytes in q coordinate (Mma in filter s dimension)
reset_bytes = reset_bytes_s_; reset_bytes = reset_bytes_s_;
} else { } else {
@ -351,7 +351,7 @@ public:
next_idx = 2; next_idx = 2;
// Restore bytes in p and q coordinate (Mma in filter s and r dimenstion) // Restore bytes in p and q coordinate (Mma in filter s and r dimension)
reset_bytes = reset_bytes_r_; reset_bytes = reset_bytes_r_;
} }
#else #else

View File

@ -195,7 +195,7 @@ public:
s = filter_s_[iteration_contiguous_]; s = filter_s_[iteration_contiguous_];
} }
else { else {
/// Multiple access to support non-128b alignment in contiguous dimenstion /// Multiple access to support non-128b alignment in contiguous dimension
c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C; c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C;
int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C; int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C;
s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S; s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S;

View File

@ -212,7 +212,7 @@ public:
if (kAccessesPerVector > 1) { if (kAccessesPerVector > 1) {
// This code section is only to support non-128b alignment // This code section is only to support non-128b alignment
// Multiple access to support non-128b alignment in contiguous dimenstion // Multiple access to support non-128b alignment in contiguous dimension
int wrap_c; int wrap_c;
params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements); params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements);

View File

@ -241,7 +241,7 @@ public:
int rs_plane_idx = 0; int rs_plane_idx = 0;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -238,7 +238,7 @@ public:
int smem_write_stage_idx = 1; int smem_write_stage_idx = 1;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -67,7 +67,7 @@ static int get_strided_dgrad_tile_m(
// CUTLASS strided dgrad performance for stride > filter, i.e., stride={2x2} and filter={1x1}) // CUTLASS strided dgrad performance for stride > filter, i.e., stride={2x2} and filter={1x1})
// //
// * Optimization * // * Optimization *
// Only launch CTAs in M dimenstion which contribute to a row in Dx output // Only launch CTAs in M dimension which contribute to a row in Dx output
// //
// //
// * Constraints * // * Constraints *
@ -107,7 +107,7 @@ struct StridedDgradHorizontalThreadblockSwizzle :
// compute number of tiles in m dimension // compute number of tiles in m dimension
int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());
// compute number of tiles in n dimenstion // compute number of tiles in n dimension
int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();
return gemm::GemmCoord( return gemm::GemmCoord(
@ -148,7 +148,7 @@ struct StridedDgradIdentityThreadblockSwizzle :
// compute number of tiles in m dimension // compute number of tiles in m dimension
int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());
// compute number of tiles in n dimenstion // compute number of tiles in n dimension
int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();
return gemm::GemmCoord( return gemm::GemmCoord(

View File

@ -77,7 +77,7 @@ namespace threadblock {
// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
template < template <
/// Epilouge Shape /// Epilogue Shape
typename Shape_, typename Shape_,
/// Warp-level mma operator /// Warp-level mma operator
typename WarpMmaTensorOp_, typename WarpMmaTensorOp_,

View File

@ -78,7 +78,7 @@ namespace threadblock {
// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
template < template <
/// Epilouge Shape /// Epilogue Shape
typename Shape_, typename Shape_,
/// Warp-level mma operator /// Warp-level mma operator
typename WarpMmaTensorOp_, typename WarpMmaTensorOp_,

View File

@ -198,7 +198,7 @@ private:
/// A thread's starting column /// A thread's starting column
Index thread_start_column_; Index thread_start_column_;
/// Initial thread ouput location /// Initial thread output location
int thread_start_n_, thread_start_p_, thread_start_q_; int thread_start_n_, thread_start_p_, thread_start_q_;
/// Current threadblock tile index /// Current threadblock tile index

View File

@ -186,10 +186,10 @@ private:
/// Extent of the matrix tile in rows /// Extent of the matrix tile in rows
Index extent_row_; Index extent_row_;
/// Starting Dx h and w dimenstion for strided dgrad mapping /// Starting Dx h and w dimension for strided dgrad mapping
int start_h_, start_w_; int start_h_, start_w_;
/// Effective Dy P and Q dimenstions for strided dgrad mapping /// Effective Dy P and Q dimensions for strided dgrad mapping
int p_, q_; int p_, q_;
/// A thread's starting row position (assuming steady-state predicates have been computed) /// A thread's starting row position (assuming steady-state predicates have been computed)

View File

@ -547,7 +547,7 @@ public:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -521,7 +521,7 @@ public:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -476,7 +476,7 @@ public:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -454,7 +454,7 @@ public:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -475,7 +475,7 @@ public:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -194,7 +194,7 @@ class GemmLayernormMainloopFusion :
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -219,7 +219,7 @@ class GemmUniversal :
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -198,7 +198,7 @@ class GemmUniversalWithBroadcast :
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -211,7 +211,7 @@ class GemmWithKReduction :
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -348,7 +348,7 @@ public:
}; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchange operand. /// Partial specialization for column-major output exchange operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -325,7 +325,7 @@ public:
}; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Parital specialization for column-major output exchange operand. /// Partial specialization for column-major output exchange operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -408,7 +408,7 @@ public:
call GEMM mainloop for with RowMajor efficient-epilogue call GEMM mainloop for with RowMajor efficient-epilogue
********************************************************************************************************/ ********************************************************************************************************/
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -563,7 +563,7 @@ For the mainloop and trmm kernel, `A` and `B` points to left-side and right-side
call GEMM mainloop for with RowMajor efficient-epilogue call GEMM mainloop for with RowMajor efficient-epilogue
********************************************************************************************************/ ********************************************************************************************************/
/// Parital specialization for column-major output exchanges problem size and operand. /// Partial specialization for column-major output exchanges problem size and operand.
template < template <
/// Element type for A matrix operand /// Element type for A matrix operand
typename ElementA_, typename ElementA_,

View File

@ -137,7 +137,7 @@ struct DefaultGemmWithBroadcast {
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
/// Parital specialization: ArchTag = cutlass::arch::Sm70 /// Partial specialization: ArchTag = cutlass::arch::Sm70
/// ///
/// ///
template < template <

View File

@ -138,7 +138,7 @@ struct DefaultGemmWithReduction {
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
/// Parital specialization: ArchTag = cutlass::arch::Sm70 /// Partial specialization: ArchTag = cutlass::arch::Sm70
/// ///
/// ///
template < template <

View File

@ -138,7 +138,7 @@
i = i_macro i = i_macro
j = j_macro j = j_macro
Handling cases with grid dimensions that aren't multiples of eachother Handling cases with grid dimensions that aren't multiples of each other
---------------------------------------------------------------------- ----------------------------------------------------------------------
Even though threadblock shapes M and N are typically multiples of one another, the grid Even though threadblock shapes M and N are typically multiples of one another, the grid
for a given problem may not have dimensions of the same ratio as that of the threadblock. for a given problem may not have dimensions of the same ratio as that of the threadblock.

View File

@ -196,7 +196,7 @@ public:
// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
return; return;
} }
#endif #endif

View File

@ -186,7 +186,7 @@ public:
// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
return; return;
} }
#endif #endif

View File

@ -258,7 +258,7 @@ public:
// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
return; return;
} }
#endif #endif

View File

@ -271,7 +271,7 @@ public:
} }
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -321,7 +321,7 @@ public:
iterator_B_imag.clear_mask(gemm_k_iterations <= 1); iterator_B_imag.clear_mask(gemm_k_iterations <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tighest latency requirement). // shared memory loads (which have the tightest latency requirement).
// //
// Mainloop // Mainloop

View File

@ -83,7 +83,7 @@ struct TensorReductionAffineContiguousParams {
uint64_t outer_count; /// Number of elements in outer index space uint64_t outer_count; /// Number of elements in outer index space
ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
ElementSource const * source; /// Poitner to source pointer of rank kRank ElementSource const * source; /// Pointer to source pointer of rank kRank
ReductionOp reduction_op; /// Reduction operator ReductionOp reduction_op; /// Reduction operator
ElementCompute reduction_identity; /// Identity element used by reduction operator ElementCompute reduction_identity; /// Identity element used by reduction operator
ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions

View File

@ -85,7 +85,7 @@ struct TensorReductionAffineStridedParams {
uint64_t outer_count; /// Number of elements in outer index space uint64_t outer_count; /// Number of elements in outer index space
ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
ElementSource const * source; /// Poitner to source pointer of rank kRank ElementSource const * source; /// Pointer to source pointer of rank kRank
ReductionOp reduction_op; /// Reduction operator ReductionOp reduction_op; /// Reduction operator
ElementCompute reduction_identity; /// Identity element for reduction operator ElementCompute reduction_identity; /// Identity element for reduction operator
ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions

View File

@ -399,7 +399,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
bool is_residue_tile_; bool is_residue_tile_;
/// Below is used when Gather is turned on. We need to record strided_offset /// Below is used when Gather is turned on. We need to record strided_offset
/// and contiguous_offset seperated to compute the offset by using /// and contiguous_offset separated to compute the offset by using
/// ///
/// offset = contiguous_offset + indices[strided_offset] /// offset = contiguous_offset + indices[strided_offset]
/// ///

View File

@ -1079,7 +1079,7 @@ class RegularTileIterator<
// //
/// The crosswised elements will be stored in a line. /// The crosswised elements will be stored in a line.
/// line_size is size of crosswised dimention plus padding. /// line_size is size of crosswised dimension plus padding.
/// in units of AccessType /// in units of AccessType
Index line_size; Index line_size;

View File

@ -347,7 +347,7 @@ creating GEMM-B tile in shared memory.
The improvements covered by optimized iterators are: The improvements covered by optimized iterators are:
- (a) Precomputing kernel-invariant pointer deltas on the host - (a) Precomputing kernel-invariant pointer deltas on the host
- (b) Computing cta-invariant mask predicates on device-side iterator ctors - (b) Computing cta-invariant mask predicates on device-side iterator ctors
- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimenstions to convolution tensors. - (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimensions to convolution tensors.
For example, _optimized_ activation iterator uses fast divmod to map GEMM _M_ to NPQ For example, _optimized_ activation iterator uses fast divmod to map GEMM _M_ to NPQ
for activation iterator for activation iterator

View File

@ -587,7 +587,8 @@ To instantiate all operations supporting all tile sizes, data types, and alignme
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=all $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=all
``` ```
The above command line generates about twenty thousand kernels targetting NVIDIA Ampere, Turing, and Volta architectures.
The above command line generates about twenty thousand kernels targeting NVIDIA Ampere, Turing, and Volta architectures.
Compiling thousands of kernels for three different architectures is time consuming. Additionaly, this would also result Compiling thousands of kernels for three different architectures is time consuming. Additionaly, this would also result
in a large binary size and on some platforms linker to fail on building the library. in a large binary size and on some platforms linker to fail on building the library.
@ -641,13 +642,13 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop,s1681
$ cmake .. -DCUTLASS_NVCC_ARCHS='50;60;61;70;75;80' -DCUTLASS_LIBRARY_KERNELS=sfprop $ cmake .. -DCUTLASS_NVCC_ARCHS='50;60;61;70;75;80' -DCUTLASS_LIBRARY_KERNELS=sfprop
``` ```
**Example.** All forward propagation (fprop) convolution kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere's 16816 Tensor Core operation **Example.** All forward propagation (fprop) convolution kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere's 16816 Tensor Core operation
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop_*_f16 $ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop_*_f16
``` ```
**Example.** All backward weight gradient (wgrad) convolution kernels with FP32 accumulation, FP16 input, and optimized global memory iterator **Example.** All backward weight gradient (wgrad) convolution kernels with FP32 accumulation, FP16 input, and optimized global memory iterator
targetting NVIDIA Ampere, Turing, and Volta Tensor Core operations targeting NVIDIA Ampere, Turing, and Volta Tensor Core operations
```bash ```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*wgrad_optimized_f16 $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*wgrad_optimized_f16
``` ```

View File

@ -573,7 +573,7 @@ bool TestSpecificConv2d(
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes) // (conv_blacklist_sizes)
///////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm> template <typename ImplicitGemm>

View File

@ -517,7 +517,7 @@ public:
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes) // (conv_blacklist_sizes)
///////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm, int InterleavedK> template <typename ImplicitGemm, int InterleavedK>

View File

@ -502,7 +502,7 @@ public:
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes) // (conv_blacklist_sizes)
///////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm, template <typename ImplicitGemm,

View File

@ -464,7 +464,7 @@ public:
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes) // (conv_blacklist_sizes)
///////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm> template <typename ImplicitGemm>

View File

@ -522,7 +522,7 @@ public:
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionaly, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes) // (conv_blacklist_sizes)
///////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -638,7 +638,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B
>; >;
// Epilouge // Epilogue
using CollectiveEpilogue = epilogue::collective::DefaultEpilogue< using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
TagToStrideC_t<LayoutC>, TagToStrideC_t<LayoutC>,
TagToStrideC_t<LayoutC>, TagToStrideC_t<LayoutC>,

View File

@ -321,13 +321,13 @@ public:
NumericTypeID element_C, /// Data type of C and D matrix NumericTypeID element_C, /// Data type of C and D matrix
void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices
void const * const * ptr_C_imag, /// Pointer to array containing poitners to imaginary part of C matrices void const * const * ptr_C_imag, /// Pointer to array containing pointers to imaginary part of C matrices
int64_t ldc_real, /// Leading dimension of real part of C matrix int64_t ldc_real, /// Leading dimension of real part of C matrix
int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix
void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices
void * const * ptr_D_imag, /// Pointer to array containing poitners to imaginary part of D matrices void * const * ptr_D_imag, /// Pointer to array containing pointers to imaginary part of D matrices
int64_t ldd_real, /// Leading dimension of real part of D matrix int64_t ldd_real, /// Leading dimension of real part of D matrix
int64_t ldd_imag /// Leading dimension of imaginary part of D matrix int64_t ldd_imag /// Leading dimension of imaginary part of D matrix

View File

@ -518,7 +518,7 @@ struct GemmDescription : public OperationDescription {
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
/// Desciprion for structured sparse GEMMs. /// Description for structured sparse GEMMs.
struct SparseGemmDescription : public GemmDescription { struct SparseGemmDescription : public GemmDescription {
/// Description structure for structured sparse GEMM /// Description structure for structured sparse GEMM
@ -1160,7 +1160,7 @@ struct GemmGroupedArguments {
// OperationKind: kSparseGemm // OperationKind: kSparseGemm
// //
/// Computes GEMM assumine one of the inputs has 2:4 structured sparsity. /// Computes GEMM assuming one of the inputs has 2:4 structured sparsity.
struct SparseGemmConfiguration { struct SparseGemmConfiguration {
GemmUniversalMode mode; GemmUniversalMode mode;
@ -1187,7 +1187,7 @@ struct SparseGemmArguments {
void const *B; /// pointer to B matrix void const *B; /// pointer to B matrix
void const *C; /// pointer to C matrix void const *C; /// pointer to C matrix
void *D; /// pointer to D matrix void *D; /// pointer to D matrix
void const *E; /// pointer to E matric (metadata) void const *E; /// pointer to E matrix (metadata)
void const *alpha; /// pointer to alpha scalar void const *alpha; /// pointer to alpha scalar
void const *beta; /// pointer to beta scalar void const *beta; /// pointer to beta scalar
@ -1465,7 +1465,7 @@ struct ConvArguments {
/// pointer to implicit gemm matrix C /// pointer to implicit gemm matrix C
void const *C; void const *C;
/// pointer to implicit gemm desitination matrix D /// pointer to implicit gemm destination matrix D
void *D; void *D;
/// Host or device pointer to alpha scalar /// Host or device pointer to alpha scalar
@ -1487,16 +1487,16 @@ struct ConvArguments {
// //
struct ReductionConfiguration { struct ReductionConfiguration {
/// Redcution problem size /// Reduction problem size
MatrixCoord problem_size; MatrixCoord problem_size;
/// Number of partitions to reduce /// Number of partitions to reduce
int partitions; int partitions;
/// Number of lements between each partition /// Number of elements between each partition
int64_t partition_stride; int64_t partition_stride;
/// leading dimension of 'w'orksace operand /// leading dimension of 'w'orkspace operand
int64_t ldw; int64_t ldw;
/// leading dimension of 's'ource operand /// leading dimension of 's'ource operand

Some files were not shown because too many files have changed in this diff Show More