Fix typos 2 (#842)
Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
parent c4f6b8c6bc
commit 7e370c9637
@@ -328,7 +328,7 @@ or a subset of kernels for NVIDIA Ampere and Turing architecture:
 
 ### Building a subset Tensor Core GEMM kernels
 
-To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture,
+To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture,
 use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8
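Note: in the quickstart being patched here, each cmake configure step is followed by a build-and-profile step. A minimal sketch of that flow, assuming the standard `cutlass_profiler` make target from the same guide:

```bash
# Sketch (assumed follow-up to the configure above): build the profiler,
# then run only the Tensor Core GEMM kernels selected by the filter.
$ make cutlass_profiler -j12
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*gemm_f16_*_nt_align8
```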
@@ -376,7 +376,7 @@ reference_device: Passed
 
 ### Building one CUDA Core GEMM kernel
 
-To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
 ...
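Note: the header of the next hunk quotes the profiler command the quickstart uses to exercise this single SGEMM kernel; shown in full here for readability:

```bash
# Profiler run for the one SGEMM kernel built above; the m/n/k sizes are
# the quickstart's own example (visible in the following hunk header).
$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
```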
@@ -418,7 +418,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
 ### Building a subset of Tensor Core Convolution kernels
 
 To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation
-and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16
 ...
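Note: the same configure/build/profile pattern applies to the convolution subset; a sketch, where the conv problem-size flags are an assumption about the profiler's options rather than something quoted in this diff:

```bash
# Sketch: build and profile the selected fprop kernels. The --n/--h/--w/
# --c/--k/--r/--s problem-size flags are assumed, not quoted from the diff.
$ make cutlass_profiler -j12
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*fprop_optimized_f16 \
    --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
```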
@@ -466,7 +466,7 @@ reference_device: Passed
 ### Building one Convolution CUDA kernel
 
 To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation
-and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line:
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
 ...
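Note: a sketch of running the single SIMT fprop kernel from the hunk above; the kernel name matches the CUTLASS_LIBRARY_KERNELS filter, while the invocation itself is assumed from the pattern of the earlier hunks:

```bash
# Sketch: build the profiler and run the one SIMT fprop kernel selected
# by the filter above (invocation assumed, per the earlier GEMM examples).
$ make cutlass_profiler -j12
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
```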
@@ -280,15 +280,15 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <tr id="row_0_3_0_13_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1DefaultGemmConfiguration_3_01arch_1_1OpClassWmmaTensorOp_00_0884059ecad03bea3e86c4cf722226097.html" target="_self">DefaultGemmConfiguration< arch::OpClassWmmaTensorOp, ArchTag, ElementA, ElementB, ElementC, ElementAccumulator ></a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_14_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_14_" class="arrow" onclick="toggleFolder('0_3_0_14_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_14_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
-<tr id="row_0_3_0_15_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_15_" class="arrow" onclick="toggleFolder('0_3_0_15_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
+<tr id="row_0_3_0_15_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_15_" class="arrow" onclick="toggleFolder('0_3_0_15_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
 <tr id="row_0_3_0_15_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
 <tr id="row_0_3_0_16_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_16_" class="arrow" onclick="toggleFolder('0_3_0_16_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">GemmBatched</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_16_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
-<tr id="row_0_3_0_17_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_17_" class="arrow" onclick="toggleFolder('0_3_0_17_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
+<tr id="row_0_3_0_17_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_17_" class="arrow" onclick="toggleFolder('0_3_0_17_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
 <tr id="row_0_3_0_17_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
 <tr id="row_0_3_0_18_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_18_" class="arrow" onclick="toggleFolder('0_3_0_18_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">GemmComplex</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_18_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
-<tr id="row_0_3_0_19_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_19_" class="arrow" onclick="toggleFolder('0_3_0_19_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
+<tr id="row_0_3_0_19_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_19_" class="arrow" onclick="toggleFolder('0_3_0_19_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
 <tr id="row_0_3_0_19_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
 <tr id="row_0_3_0_20_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_0_3_0_20_" class="arrow" onclick="toggleFolder('0_3_0_20_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html" target="_self">GemmSplitKParallel</a></td><td class="desc"></td></tr>
 <tr id="row_0_3_0_20_0_" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel_1_1Arguments.html" target="_self">Arguments</a></td><td class="desc">Argument structure </td></tr>
@@ -594,7 +594,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <tr id="row_0_8_1_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
 <tr id="row_0_8_1_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
 <tr id="row_0_8_1_6_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
-<tr id="row_0_8_1_7_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
+<tr id="row_0_8_1_7_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
 <tr id="row_0_8_1_8_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html" target="_self">TensorDiagonalForEach</a></td><td class="desc">Launches a kernel calling a functor for each element along a tensor's diagonal </td></tr>
 <tr id="row_0_8_1_9_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorForEach.html" target="_self">TensorForEach</a></td><td class="desc">Launches a kernel calling a functor for each element in a tensor's index space </td></tr>
 <tr id="row_0_8_2_" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_0_8_2_" class="arrow" onclick="toggleFolder('0_8_2_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1reference_1_1host.html" target="_self">host</a></td><td class="desc"></td></tr>
@@ -620,7 +620,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <tr id="row_0_8_2_2_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">Gemm</a></td><td class="desc"></td></tr>
 <tr id="row_0_8_2_3_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
 <tr id="row_0_8_2_4_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
-<tr id="row_0_8_2_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
+<tr id="row_0_8_2_5_" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
 <tr id="row_0_9_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_0_9_" class="arrow" onclick="toggleFolder('0_9_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1thread.html" target="_self">thread</a></td><td class="desc"></td></tr>
 <tr id="row_0_9_0_" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1thread_1_1Matrix.html" target="_self">Matrix</a></td><td class="desc">Per-thread matrix object storing a packed matrix </td></tr>
 <tr id="row_0_10_" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_0_10_" class="arrow" onclick="toggleFolder('0_10_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacecutlass_1_1transform.html" target="_self">transform</a></td><td class="desc"></td></tr>

@@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><!--header-->
 <div class="contents">
 
-<p>Parital specialization for column-major output exchanges problem size and operand.
+<p>Partial specialization for column-major output exchanges problem size and operand.
 </p>
 
 <p><code>#include <<a class="el" href="device_2gemm__batched_8h_source.html">gemm_batched.h</a>></code></p>

@@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><!--header-->
 <div class="contents">
 
-<p>Parital specialization for column-major output exchanges problem size and operand.
+<p>Partial specialization for column-major output exchanges problem size and operand.
 </p>
 
 <p><code>#include <<a class="el" href="include_2cutlass_2gemm_2device_2gemm__complex_8h_source.html">gemm_complex.h</a>></code></p>

@@ -108,7 +108,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><!--header-->
 <div class="contents">
 
-<p>Parital specialization for column-major output exchanges problem size and operand.
+<p>Partial specialization for column-major output exchanges problem size and operand.
 </p>
 
 <p><code>#include <<a class="el" href="include_2cutlass_2gemm_2device_2gemm_8h_source.html">gemm.h</a>></code></p>

File diff suppressed because one or more lines are too long
@@ -130,7 +130,7 @@ Classes</h2></td></tr>
 <tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_1_1Arguments.html#details">More...</a><br /></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td></tr>
-<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
+<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >::Arguments</a></td></tr>
 <tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_213d78696663f4231cd52c6a277c60e5.html#details">More...</a><br /></td></tr>

File diff suppressed because one or more lines are too long
@@ -237,7 +237,7 @@ Functions</h2></td></tr>
 <tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
 <tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a> (TensorView< Element, Layout > view)</td></tr>
-<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's digonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
+<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
 <tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2"> </td></tr>
 <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
 <tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">cutlass::reference::device::TensorUpdateDiagonal</a> (TensorView< Element, Layout > view, Element diag=Element(1))</td></tr>

@@ -125,7 +125,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomGaussianFunc_1_1Params.html">cutlass::reference::device::detail::RandomGaussianFunc::Params</a></div><div class="ttdoc">Parameters structure. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:99</div></div>
 <div class="ttc" id="structcutlass_1_1Distribution_html_a07cb089b346ef06e198f6043128264fb"><div class="ttname"><a href="structcutlass_1_1Distribution.html#a07cb089b346ef06e198f6043128264fb">cutlass::Distribution::kind</a></div><div class="ttdeci">Kind kind</div><div class="ttdoc">Active variant kind. </div><div class="ttdef"><b>Definition:</b> distribution.h:64</div></div>
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params_html_a267e7ea4e77076cc9be7d639b3cef64d"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorFillRandomUniformFunc_1_1Params.html#a267e7ea4e77076cc9be7d639b3cef64d">cutlass::reference::device::detail::TensorFillRandomUniformFunc::Params::Params</a></div><div class="ttdeci">Params(TensorView view_=TensorView(), typename RandomFunc::Params random_=RandomFunc::Params())</div><div class="ttdoc">Construction of Gaussian RNG functor. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:422</div></div>
-<div class="ttc" id="namespacecutlass_1_1reference_1_1device_html_a6b0f21995c4fd5c33617550e6905c78e"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > view)</div><div class="ttdoc">Fills a tensor&#39;s digonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:630</div></div>
+<div class="ttc" id="namespacecutlass_1_1reference_1_1device_html_a6b0f21995c4fd5c33617550e6905c78e"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">cutlass::reference::device::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > view)</div><div class="ttdoc">Fills a tensor&#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:630</div></div>
 <div class="ttc" id="classcutlass_1_1TensorView_html_a7d3914dd5042c9c40be9e21a7b4e9ece"><div class="ttname"><a href="classcutlass_1_1TensorView.html#a7d3914dd5042c9c40be9e21a7b4e9ece">cutlass::TensorView::extent</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE TensorCoord const & extent() const </div><div class="ttdoc">Returns the extent of the view (the size along each logical dimension). </div><div class="ttdef"><b>Definition:</b> tensor_view.h:167</div></div>
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1TensorUpdateDiagonalFunc.html">cutlass::reference::device::detail::TensorUpdateDiagonalFunc</a></div><div class="ttdoc">Computes a random Gaussian distribution. </div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:645</div></div>
 <div class="ttc" id="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params_html_afe8637b103e25ec2e9b731389fa049be"><div class="ttname"><a href="structcutlass_1_1reference_1_1device_1_1detail_1_1RandomUniformFunc_1_1Params.html#afe8637b103e25ec2e9b731389fa049be">cutlass::reference::device::detail::RandomUniformFunc::Params::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> device/tensor_fill.h:315</div></div>

File diff suppressed because one or more lines are too long
@@ -141,7 +141,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <li>Semaphore()
 : <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a>
 </li>
-<li>seperate_string()
+<li>separate_string()
 : <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a>
 </li>
 <li>set()

@@ -172,7 +172,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <li>Semaphore()
 : <a class="el" href="classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070">cutlass::Semaphore</a>
 </li>
-<li>seperate_string()
+<li>separate_string()
 : <a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">cutlass::CommandLine</a>
 </li>
 <li>sequential

@ -312,23 +312,23 @@ This inheritance list is sorted roughly, but not completely, alphabetically:</di
|
|||||||
<tr id="row_197_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, InnerProductOp ></a></td><td class="desc"></td></tr>
|
<tr id="row_197_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, InnerProductOp ></a></td><td class="desc"></td></tr>
|
||||||
<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout4e016ab7cfc644acd7cb4ae770339773.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
||||||
<tr id="row_199_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
<tr id="row_199_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
||||||
<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
|
<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html" target="_self">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
|
||||||
<tr id="row_201_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
<tr id="row_201_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_193dd3a37f00deff1e5dcd7c310afb1f.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd ></a></td><td class="desc">Partial specialization for multiply-add </td></tr>
|
||||||
<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate ></a></td><td class="desc">Partial specialization for multiply-add-saturate </td></tr>
|
||||||
<tr id="row_203_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Parital specialization for XOR-popc </td></tr>
|
<tr id="row_203_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html" target="_self">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td><td class="desc">Partial specialization for XOR-popc </td></tr>
|
||||||
<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
|
<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html" target="_self">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_205_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html" target="_self">cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero ></a></td><td class="desc"></td></tr>
<tr id="row_206_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArguments.html" target="_self">cutlass::library::GemmArguments</a></td><td class="desc">Arguments for GEMM </td></tr>
<tr id="row_207_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayArguments.html" target="_self">cutlass::library::GemmArrayArguments</a></td><td class="desc">Arguments for GEMM - used by all the GEMM operations </td></tr>
<tr id="row_208_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmArrayConfiguration.html" target="_self">cutlass::library::GemmArrayConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr>
<tr id="row_209_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc"></td></tr>
<tr id="row_210_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1kernel_1_1GemmBatched.html" target="_self">cutlass::gemm::kernel::GemmBatched< Mma_, Epilogue_, ThreadblockSwizzle_ ></a></td><td class="desc"></td></tr>
<tr id="row_211_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_211_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html" target="_self">cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html" target="_self">cutlass::gemm::device::GemmBatched< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA ></a></td><td class="desc"></td></tr>
<tr id="row_213_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmBatchedConfiguration.html" target="_self">cutlass::library::GemmBatchedConfiguration</a></td><td class="desc">Configuration for batched GEMM in which multiple matrix products are computed </td></tr>
<tr id="row_214_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmBatchedIdentityThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for batched GEMMs </td></tr>
<tr id="row_215_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc"></td></tr>
<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Parital specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html" target="_self">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc">Partial specialization for column-major output exchanges problem size and operand </td></tr>
<tr id="row_217_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html" target="_self">cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial ></a></td><td class="desc"></td></tr>
<tr id="row_218_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1library_1_1GemmConfiguration.html" target="_self">cutlass::library::GemmConfiguration</a></td><td class="desc">Configuration for basic GEMM operations </td></tr>
<tr id="row_219_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcutlass_1_1gemm_1_1threadblock_1_1GemmHorizontalThreadblockSwizzle.html" target="_self">cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle</a></td><td class="desc">Threadblock swizzling function for GEMMs </td></tr>
@ -192,7 +192,7 @@ Functions</h2></td></tr>
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a> (TensorView< Element, Layout > dst)</td></tr>
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's digonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's diagonal with 1 and 0 everywhere else. <a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">cutlass::reference::host::TensorUpdateDiagonal</a> (TensorView< Element, Layout > dst, Element val=Element(1))</td></tr>
@ -132,7 +132,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc_html_a4c9943f36faab7d4928b1f130d0b784c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomGaussianFunc.html#a4c9943f36faab7d4928b1f130d0b784c">cutlass::reference::host::detail::RandomGaussianFunc::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:115</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorUpdateOffDiagonalFunc.html">cutlass::reference::host::detail::TensorUpdateOffDiagonalFunc</a></div><div class="ttdoc">< Layout function </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:597</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_ad0de7d4946af855288d7f9cccb9a18eb"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#ad0de7d4946af855288d7f9cccb9a18eb">cutlass::reference::host::detail::RandomUniformFunc< complex< Element > >::int_scale</a></div><div class="ttdeci">int int_scale</div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:357</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a29548cb522d9c147cf34263ecac75d89"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > dst)</div><div class="ttdoc">Helper to fill a tensor&#39;s digonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:564</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a29548cb522d9c147cf34263ecac75d89"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">cutlass::reference::host::TensorFillIdentity</a></div><div class="ttdeci">void TensorFillIdentity(TensorView< Element, Layout > dst)</div><div class="ttdoc">Helper to fill a tensor&#39;s diagonal with 1 and 0 everywhere else. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:564</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4_html_a6ef7020f1108432fe51853dffb7e727c"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1RandomUniformFunc_3_01complex_3_01Element_01_4_01_4.html#a6ef7020f1108432fe51853dffb7e727c">cutlass::reference::host::detail::RandomUniformFunc< complex< Element > >::operator()</a></div><div class="ttdeci">complex< Element > operator()() const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:375</div></div>
<div class="ttc" id="namespacecutlass_html_a67f9e83dd59615eff837ea66984c121c"><div class="ttname"><a href="namespacecutlass.html#a67f9e83dd59615eff837ea66984c121c">cutlass::log</a></div><div class="ttdeci">CUTLASS_HOST_DEVICE complex< T > log(complex< T > const &z)</div><div class="ttdoc">Computes the complex exponential of z. </div><div class="ttdef"><b>Definition:</b> complex.h:381</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc_html_a4e447a80bd94cde69fa66f9e9d882b28"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorFillGaussianFunc.html#a4e447a80bd94cde69fa66f9e9d882b28">cutlass::reference::host::detail::TensorFillGaussianFunc::operator()</a></div><div class="ttdeci">void operator()(Coord< Layout::kRank > const &coord) const </div><div class="ttdoc">Compute random value and update RNG state. </div><div class="ttdef"><b>Definition:</b> host/tensor_fill.h:236</div></div>
@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_a1161a761c596e714982fe30141211cca"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#a1161a761c596e714982fe30141211cca">cutlass::reference::host::detail::TensorForEachHelper::kActiveRank</a></div><div class="ttdeci">static int const kActiveRank</div><div class="ttdoc">Index of the active rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:44</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html_aa63906bbecfe42eec1991c9176f066d9"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html#aa63906bbecfe42eec1991c9176f066d9">cutlass::reference::host::detail::TensorForEachHelper::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)</div><div class="ttdoc">Constructor for general rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:47</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_html"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper.html">cutlass::reference::host::detail::TensorForEachHelper</a></div><div class="ttdoc">Helper to perform for-each operation. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:41</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4_html_a5029a4405a9a5e64011addb43bb88120"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">cutlass::reference::host::detail::TensorForEachHelper< Func, Rank, 0 >::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)</div><div class="ttdoc">Constructor for fastest chaning rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:67</div></div>
<div class="ttc" id="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4_html_a5029a4405a9a5e64011addb43bb88120"><div class="ttname"><a href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">cutlass::reference::host::detail::TensorForEachHelper< Func, Rank, 0 >::TensorForEachHelper</a></div><div class="ttdeci">TensorForEachHelper(Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)</div><div class="ttdoc">Constructor for fastest changing rank. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:67</div></div>
<div class="ttc" id="structcutlass_1_1Coord_html"><div class="ttname"><a href="structcutlass_1_1Coord.html">cutlass::Coord</a></div><div class="ttdoc">Statically-sized array specifying Coords within a tensor. </div><div class="ttdef"><b>Definition:</b> coord.h:43</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a3825b1aaaf5e5abf0de5f427e3481ada"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a3825b1aaaf5e5abf0de5f427e3481ada">cutlass::reference::host::TensorForEachLambda</a></div><div class="ttdeci">void TensorForEachLambda(Coord< Rank > extent, Func func)</div><div class="ttdoc">Iterates over the index space of a tensor and calls a C++ lambda. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:98</div></div>
<div class="ttc" id="namespacecutlass_1_1reference_1_1host_html_a8c798c04df572b34e3ed3976d69f993d"><div class="ttname"><a href="namespacecutlass_1_1reference_1_1host.html#a8c798c04df572b34e3ed3976d69f993d">cutlass::reference::host::TensorForEach</a></div><div class="ttdeci">void TensorForEach(Coord< Rank > extent, Func &func)</div><div class="ttdoc">Iterates over the index space of a tensor. </div><div class="ttdef"><b>Definition:</b> host/tensor_foreach.h:87</div></div>
@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html">cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layou1b211cc9c97c022d8fe10f2dd32c8709.html#details">More...</a><br /></td></tr>
@ -130,7 +130,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_1_1Arguments.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html">cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Argument structure. <a href="structcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_a3923967cafb5cb9774c320dc24baa77.html#details">More...</a><br /></td></tr>
File diff suppressed because one or more lines are too long
@ -134,17 +134,17 @@ Classes</h2></td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm.html">Gemm</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html">Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched.html">GemmBatched</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html">GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex.html">GemmComplex</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html">GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for column-major output exchanges problem size and operand. <a href="classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class  </td><td class="memItemRight" valign="bottom"><a class="el" href="classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html">GemmSplitKParallel</a></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
@ -125,7 +125,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html">TensorDiagonalForEach</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Launches a kernel calling a functor for each element along a tensor's diagonal. <a href="structcutlass_1_1reference_1_1device_1_1TensorDiagonalForEach.html#details">More...</a><br /></td></tr>
@ -183,7 +183,7 @@ Functions</h2></td></tr>
<tr class="separator:a6e23d479ebb3760d5846ed1b67e450e4"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:a6b0f21995c4fd5c33617550e6905c78e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#a6b0f21995c4fd5c33617550e6905c78e">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > view)</td></tr>
<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's digonal with 1 and 0 everywhere else. <a href="#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
<tr class="memdesc:a6b0f21995c4fd5c33617550e6905c78e"><td class="mdescLeft"> </td><td class="mdescRight">Fills a tensor's diagonal with 1 and 0 everywhere else. <a href="#a6b0f21995c4fd5c33617550e6905c78e">More...</a><br /></td></tr>
<tr class="separator:a6b0f21995c4fd5c33617550e6905c78e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
<tr class="memitem:aaff3d7919a2f2dce14eb254c17eead9a"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1device.html#aaff3d7919a2f2dce14eb254c17eead9a">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > view, Element diag=Element(1))</td></tr>
@ -122,7 +122,7 @@ Classes</h2></td></tr>
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
|
||||||
@ -247,7 +247,7 @@ Functions</h2></td></tr>
|
|||||||
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a1c81144ca36832a48d04d1b5b6498080"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
||||||
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst)</td></tr>
|
<tr class="memitem:a29548cb522d9c147cf34263ecac75d89"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#a29548cb522d9c147cf34263ecac75d89">TensorFillIdentity</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst)</td></tr>
|
||||||
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's digonal with 1 and 0 everywhere else. <a href="#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
|
<tr class="memdesc:a29548cb522d9c147cf34263ecac75d89"><td class="mdescLeft"> </td><td class="mdescRight">Helper to fill a tensor's diagonal with 1 and 0 everywhere else. <a href="#a29548cb522d9c147cf34263ecac75d89">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a29548cb522d9c147cf34263ecac75d89"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplParams" colspan="2">template<typename Element , typename Layout > </td></tr>
|
||||||
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst, Element val=Element(1))</td></tr>
|
<tr class="memitem:acbf747241e8ac6ef9b1702b735a7913e"><td class="memTemplItemLeft" align="right" valign="top">void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacecutlass_1_1reference_1_1host.html#acbf747241e8ac6ef9b1702b735a7913e">TensorUpdateDiagonal</a> (<a class="el" href="classcutlass_1_1TensorView.html">TensorView</a>< Element, Layout > dst, Element val=Element(1))</td></tr>
|
||||||
|
@ -14,7 +14,7 @@ var searchData=
|
|||||||
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]],
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]],
|
||||||
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]],
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]],
|
||||||
['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]],
|
['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]],
|
||||||
['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
||||||
['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]],
|
['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]],
|
||||||
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
||||||
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
||||||
|
@ -3,7 +3,7 @@ var searchData=
|
|||||||
['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum< Array< T, N > >']]],
|
['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum< Array< T, N > >']]],
|
||||||
['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]],
|
['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]],
|
||||||
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]],
|
['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]],
|
||||||
['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]],
|
||||||
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]],
|
||||||
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]],
|
||||||
['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]],
|
['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]],
|
||||||
|
@ -115,7 +115,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1603f1c65c6d8d3d4262443b40e5c290">keys</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1603f1c65c6d8d3d4262443b40e5c290">keys</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
||||||
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a0bee40a3cc6078a08eec5d4ca4711f61">num_naked_args</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a0bee40a3cc6078a08eec5d4ca4711f61">num_naked_args</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a228e1a273d223eec4b2f6d73135d3c1e">parsed_argc</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a228e1a273d223eec4b2f6d73135d3c1e">parsed_argc</a>() const </td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
|
||||||
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">seperate_string</a>(std::string const &str, std::vector< value_t > &vals, char sep= ',')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">separate_string</a>(std::string const &str, std::vector< value_t > &vals, char sep= ',')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1944da52162e04b12a82ce0c1ade676e">tokenize</a>(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a1944da52162e04b12a82ce0c1ade676e">tokenize</a>(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
||||||
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a440c25cfb006f218ff4705a43320a28b">tokenize</a>(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
<tr class="even"><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#a440c25cfb006f218ff4705a43320a28b">tokenize</a>(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':')</td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
|
||||||
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#ade127841e9730589f611b618e9440012">values</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
<tr><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html#ade127841e9730589f611b618e9440012">values</a></td><td class="entry"><a class="el" href="structcutlass_1_1CommandLine.html">cutlass::CommandLine</a></td><td class="entry"></td></tr>
|
||||||
|
@ -151,7 +151,7 @@ Static Public Member Functions</h2></td></tr>
|
|||||||
<tr class="memdesc:a440c25cfb006f218ff4705a43320a28b"><td class="mdescLeft"> </td><td class="mdescRight">Tokenizes a comma-delimited list of string pairs delimited by ':'. <a href="#a440c25cfb006f218ff4705a43320a28b">More...</a><br /></td></tr>
|
<tr class="memdesc:a440c25cfb006f218ff4705a43320a28b"><td class="mdescLeft"> </td><td class="mdescRight">Tokenizes a comma-delimited list of string pairs delimited by ':'. <a href="#a440c25cfb006f218ff4705a43320a28b">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a440c25cfb006f218ff4705a43320a28b"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a440c25cfb006f218ff4705a43320a28b"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplParams" colspan="2">template<typename value_t > </td></tr>
|
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplParams" colspan="2">template<typename value_t > </td></tr>
|
||||||
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplItemLeft" align="right" valign="top">static void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">seperate_string</a> (std::string const &str, std::vector< value_t > &vals, char sep= ',')</td></tr>
|
<tr class="memitem:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memTemplItemLeft" align="right" valign="top">static void </td><td class="memTemplItemRight" valign="bottom"><a class="el" href="structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590">separate_string</a> (std::string const &str, std::vector< value_t > &vals, char sep= ',')</td></tr>
|
||||||
<tr class="separator:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a5f86e4b2bd8c44b739c83530d77c5590"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-attribs"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-attribs"></a>
|
||||||
@ -548,7 +548,7 @@ template<typename value_t > </div>
|
|||||||
<td class="mlabels-left">
|
<td class="mlabels-left">
|
||||||
<table class="memname">
|
<table class="memname">
|
||||||
<tr>
|
<tr>
|
||||||
<td class="memname">static void cutlass::CommandLine::seperate_string </td>
|
<td class="memname">static void cutlass::CommandLine::separate_string </td>
|
||||||
<td>(</td>
|
<td>(</td>
|
||||||
<td class="paramtype">std::string const & </td>
|
<td class="paramtype">std::string const & </td>
|
||||||
<td class="paramname"><em>str</em>, </td>
|
<td class="paramname"><em>str</em>, </td>
|
||||||
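As a usage sketch of the two static helpers documented above (the input strings here are made up for illustration), `separate_string` splits a delimited list into typed values and `tokenize` splits key:value pairs:

```cpp
#include <string>
#include <utility>
#include <vector>
#include "cutlass/util/command_line.h"

int main() {
  // Split a comma-separated list into integers: vals becomes {128, 256, 512}.
  std::vector<int> vals;
  cutlass::CommandLine::separate_string(std::string("128,256,512"), vals, ',');

  // Split a comma-delimited list of "key:value" pairs into string pairs.
  std::vector<std::pair<std::string, std::string>> tokens;
  cutlass::CommandLine::tokenize(tokens, std::string("m:1024,n:512"), ',', ':');

  return 0;
}
```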
|
@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
</div><!--header-->
|
</div><!--header-->
|
||||||
<div class="contents">
|
<div class="contents">
|
||||||
|
|
||||||
<p>Parital specialization for XOR-popc.
|
<p>Partial specialization for XOR-popc.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h_source.html">gemm.h</a>></code></p>
|
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h_source.html">gemm.h</a>></code></p>
|
||||||
|
@ -112,7 +112,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
||||||
Public Member Functions</h2></td></tr>
|
Public Member Functions</h2></td></tr>
|
||||||
<tr class="memitem:a89e10e059c3ffcfe2640cf6291353937"><td class="memItemLeft" align="right" valign="top">__inline__ __device__ </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a89e10e059c3ffcfe2640cf6291353937">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &size, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord, int64_t index)</td></tr>
|
<tr class="memitem:a89e10e059c3ffcfe2640cf6291353937"><td class="memItemLeft" align="right" valign="top">__inline__ __device__ </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a89e10e059c3ffcfe2640cf6291353937">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &size, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord, int64_t index)</td></tr>
|
||||||
<tr class="memdesc:a89e10e059c3ffcfe2640cf6291353937"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest chaning rank. <a href="#a89e10e059c3ffcfe2640cf6291353937">More...</a><br /></td></tr>
|
<tr class="memdesc:a89e10e059c3ffcfe2640cf6291353937"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest changing rank. <a href="#a89e10e059c3ffcfe2640cf6291353937">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a89e10e059c3ffcfe2640cf6291353937"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a89e10e059c3ffcfe2640cf6291353937"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table>
|
</table>
|
||||||
<h2 class="groupheader">Constructor & Destructor Documentation</h2>
|
<h2 class="groupheader">Constructor & Destructor Documentation</h2>
|
||||||
|
@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
</div><!--header-->
|
</div><!--header-->
|
||||||
<div class="contents">
|
<div class="contents">
|
||||||
|
|
||||||
<p>Parital specialization for XOR-popc.
|
<p>Partial specialization for XOR-popc.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h_source.html">gemm.h</a>></code></p>
|
<p><code>#include <<a class="el" href="tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h_source.html">gemm.h</a>></code></p>
|
||||||
|
@ -113,7 +113,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-methods"></a>
|
||||||
Public Member Functions</h2></td></tr>
|
Public Member Functions</h2></td></tr>
|
||||||
<tr class="memitem:a5029a4405a9a5e64011addb43bb88120"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &extent, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord)</td></tr>
|
<tr class="memitem:a5029a4405a9a5e64011addb43bb88120"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html#a5029a4405a9a5e64011addb43bb88120">TensorForEachHelper</a> (Func &func, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > const &extent, <a class="el" href="structcutlass_1_1Coord.html">Coord</a>< Rank > &coord)</td></tr>
|
||||||
<tr class="memdesc:a5029a4405a9a5e64011addb43bb88120"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest chaning rank. <a href="#a5029a4405a9a5e64011addb43bb88120">More...</a><br /></td></tr>
|
<tr class="memdesc:a5029a4405a9a5e64011addb43bb88120"><td class="mdescLeft"> </td><td class="mdescRight">Constructor for fastest changing rank. <a href="#a5029a4405a9a5e64011addb43bb88120">More...</a><br /></td></tr>
|
||||||
<tr class="separator:a5029a4405a9a5e64011addb43bb88120"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:a5029a4405a9a5e64011addb43bb88120"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-attribs"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-attribs"></a>
|
||||||
|
@ -134,7 +134,7 @@ Classes</h2></td></tr>
|
|||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout30b72addd464a2ca4a26785cbfd77a8e.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td></tr>
|
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html">cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc ></a></td></tr>
|
||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
||||||
|
@ -141,7 +141,7 @@ Classes</h2></td></tr>
|
|||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for multiply-add-saturate. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_55729eac7dbd6bf311ea36f680e83e93.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct  </td><td class="memItemRight" valign="bottom"><a class="el" href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html">cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc ></a></td></tr>
|
||||||
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Parital specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
<tr class="memdesc:"><td class="mdescLeft"> </td><td class="mdescRight">Partial specialization for XOR-popc. <a href="structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html#details">More...</a><br /></td></tr>
|
||||||
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr>
|
||||||
</table><table class="memberdecls">
|
</table><table class="memberdecls">
|
||||||
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="namespaces"></a>
|
||||||
|
File diff suppressed because one or more lines are too long
@ -47,7 +47,7 @@
|
|||||||
or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
|
or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
|
||||||
prevalent in the CUTLASS unit tests.
|
prevalent in the CUTLASS unit tests.
|
||||||
|
|
||||||
This example has delibrately been kept similar to the basic_gemm example from cutass-1.3 to
|
This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to
|
||||||
highlight the minimum number of differences needed to transition to cutlass-2.0.
|
highlight the minimum number of differences needed to transition to cutlass-2.0.
|
||||||
|
|
||||||
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu
|
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu
|
||||||
|
@ -75,7 +75,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
|
||||||
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
||||||
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
||||||
bank-conflict free manner, and ton of other variables required to compose, intialize and launch a
|
a bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
|
||||||
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
||||||
understanding and coding complicated hardware optimizations which can easily go wrong.
|
understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -107,7 +107,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
||||||
cutlass::gemm::device::Gemm template.
|
cutlass::gemm::device::Gemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -115,7 +115,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
|
|||||||
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
||||||
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
||||||
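To make the flow above concrete, here is a compressed sketch of the described steps: argument creation with a split-k factor, scratch-space query, initialization, and launch. It is a sketch, not the example's exact source: the element types, layouts, and function signature are assumptions, the tile shapes follow the text (128x128x32 / 64x64x32 / 8x8x4), and for split-k factors greater than one the Gemm template's split-k-serial flag must also be enabled (omitted here for brevity):

```cpp
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/device_memory.h"

// Tile sizes from the text; element/layout choices are assumptions for the sketch.
using Gemm = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor,             // C and D
    float,                                           // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm70,
    cutlass::gemm::GemmShape<128, 128, 32>,          // thread-block tile
    cutlass::gemm::GemmShape<64, 64, 32>,            // warp tile
    cutlass::gemm::GemmShape<8, 8, 4>>;              // mma-op tile

cutlass::Status run(int m, int n, int k,
                    cutlass::half_t const *A, int lda,
                    cutlass::half_t const *B, int ldb,
                    float *C, int ldc,
                    float alpha, float beta, int split_k_slices) {
  // Arguments: problem size, tensor refs, epilogue scalars, split-k factor.
  Gemm::Arguments args({m, n, k}, {A, lda}, {B, ldb}, {C, ldc}, {C, ldc},
                       {alpha, beta}, split_k_slices);

  // Query scratch-space requirements and allocate the workspace if any is needed.
  size_t workspace_size = Gemm::get_workspace_size(args);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  Gemm gemm_op;
  cutlass::Status status = gemm_op.initialize(args, workspace.get());
  if (status != cutlass::Status::kSuccess) {
    return status;
  }
  return gemm_op();  // launches the kernel
}
```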
|
@ -74,7 +74,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64,
|
||||||
64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
|
||||||
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
|
||||||
bank-conflict free manner, and ton of other variables required to compose, intialize and launch a
|
a bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
|
||||||
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
|
||||||
understanding and coding complicated hardware optimizations which can easily go wrong.
|
understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -106,7 +106,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
These are all put together to create a template variable which describes the CUTLASS GEMM kernel using the
|
||||||
cutlass::gemm::device::Gemm template.
|
cutlass::gemm::device::Gemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -114,7 +114,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
|
|||||||
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
In this example, we later launch a reference GEMM kernel (from CUTLASS utilities) to check whether
|
||||||
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
the output from the CUTLASS kernel matches the reference GEMM kernel.
|
||||||
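The three tile levels quoted here relate to each other in a fixed way: the thread-block tile is covered by warp tiles, and each warp tile by mma-op tiles, which is exactly what the kernel "deduces" at compile time. A quick sketch of that arithmetic for the shapes in this example (128x256x64, 64x64x16, 8x8x16); the numbers are derived directly from the text:

```cpp
#include <cstdio>

int main() {
  // Thread-block tile / warp tile / mma-op tile (MxNxK), as in the text.
  constexpr int kTbM = 128, kTbN = 256;
  constexpr int kWarpM = 64, kWarpN = 64, kWarpK = 16;
  constexpr int kInstM = 8, kInstN = 8, kInstK = 16;

  // Warps per thread-block: (128/64) x (256/64) = 2 x 4 = 8 warps (256 threads).
  constexpr int kWarps = (kTbM / kWarpM) * (kTbN / kWarpN);

  // mma-op instructions per warp tile per K-group: (64/8) x (64/8) x (16/16) = 64.
  constexpr int kMmas =
      (kWarpM / kInstM) * (kWarpN / kInstN) * (kWarpK / kInstK);

  std::printf("warps/block = %d, threads/block = %d, mma ops per warp k-step = %d\n",
              kWarps, kWarps * 32, kMmas);
  return 0;
}
```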
|
@ -76,7 +76,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128,
|
||||||
64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
||||||
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
||||||
data in bank-conflict free manner, and ton of other variables required to compose, intialize and
|
data in a bank-conflict-free manner, and a ton of other variables required to compose, initialize and
|
||||||
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
||||||
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -108,7 +108,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
||||||
kernel using the cutlass::conv::device::ImplicitGemm template.
|
kernel using the cutlass::conv::device::ImplicitGemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -117,7 +117,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K
|
|||||||
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
||||||
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
||||||
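A compressed sketch of the problem-size setup described here may help. It is an assumption-laden illustration: the filter count K is truncated in the text, so the 64 below is a placeholder, and the padding/stride/dilation values are unit defaults rather than the example's verified settings. The `Conv2dProblemSize` constructor form follows the CUTLASS convolution examples:

```cpp
#include "cutlass/conv/conv2d_problem_size.h"

// Problem size from the text: input N=1, H=64, W=64, C=128; filter R=3, S=3, C=128.
// K (number of filters) is truncated in the text -- 64 is a placeholder.
cutlass::conv::Conv2dProblemSize make_problem(int split_k_slices) {
  return cutlass::conv::Conv2dProblemSize(
      {1, 64, 64, 128},                        // input  (N, H, W, C)
      {64, 3, 3, 128},                         // filter (K, R, S, C) -- K assumed
      {1, 1, 1, 1},                            // padding
      {1, 1},                                  // stride   (h, w) -- assumed
      {1, 1},                                  // dilation (h, w) -- assumed
      cutlass::conv::Mode::kCrossCorrelation,
      split_k_slices);
}
```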
|
@ -321,7 +321,7 @@ public:
|
|||||||
int smem_write_stage_idx = 1;
|
int smem_write_stage_idx = 1;
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
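The comment above encodes the issue order of the two-stage software pipeline in the mainloop. A schematic host-side sketch of that ordering (plain stub functions, not CUTLASS APIs) under the stated assumption that shared-memory loads feed the very next mma while global loads target a later stage:

```cpp
#include <cstdio>

// Stubs standing in for the real fragment operations (not CUTLASS APIs).
void load_smem_fragment(int k)   { std::printf("  smem load feeding k=%d\n", k + 1); }
void load_global_fragment(int k) { std::printf("  global load feeding k=%d\n", k + 2); }
void warp_mma(int k)             { std::printf("  mma for k=%d\n", k); }

int main() {
  const int kIterations = 3;
  for (int k = 0; k < kIterations; ++k) {
    // 1) Shared-memory loads for the *next* warp tile go first: they feed the
    //    very next mma and therefore have the tightest latency requirement.
    load_smem_fragment(k);
    // 2) Global loads for a *future* stage are issued after them; their longer
    //    latency is hidden behind the math below.
    load_global_fragment(k);
    // 3) The current warp-level matrix multiply-accumulate overlaps both.
    warp_mma(k);
  }
  return 0;
}
```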
@ -461,7 +461,7 @@ public:
|
|||||||
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
|
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -341,7 +341,7 @@ public:
|
|||||||
int smem_write_stage_idx = 1;
|
int smem_write_stage_idx = 1;
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -325,7 +325,7 @@ public:
|
|||||||
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -346,7 +346,7 @@ public:
|
|||||||
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
|
||||||
// shared memory loads (which have the tighest latency requirement).
|
// shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -73,7 +73,7 @@ Now that we setup the properties of data, we have to setup properties of computa
|
|||||||
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64,
|
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64,
|
||||||
64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
|
||||||
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
|
||||||
data in bank-conflict free manner, and ton of other variables required to compose, intialize and
|
data in a bank-conflict-free manner, and a ton of other variables required to compose, initialize and
|
||||||
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
|
||||||
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
from understanding and coding complicated hardware optimizations which can easily go wrong.
|
||||||
|
|
||||||
@ -95,7 +95,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
|
|||||||
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
|
||||||
kernel using the cutlass::conv::device::ImplicitGemm template.
|
kernel using the cutlass::conv::device::ImplicitGemm template.
|
||||||
|
|
||||||
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
|
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
|
||||||
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get
|
||||||
in the way of learning CUTLASS.
|
in the way of learning CUTLASS.
|
||||||
|
|
||||||
@ -104,7 +104,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K
|
|||||||
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
|
||||||
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space
|
||||||
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
memory is required by the kernel we instantiated. If so, we allocate it and pass it along with the other
|
||||||
arguments created to intialize CUTLASS kernel then, the kernel is launched.
|
arguments created to initialize the CUTLASS kernel; then the kernel is launched.
|
||||||
|
|
||||||
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
In this example, we later launch a reference convolution kernel (from CUTLASS utilities) to
|
||||||
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
check whether the output from the CUTLASS kernel matches the reference implicit GEMM kernel.
|
||||||
|
@ -36,7 +36,7 @@ computing GEMM. So the output also contains either a Mx1 or 1XN vector. It onl
|
|||||||
core instructions.
|
core instructions.
|
||||||
|
|
||||||
Most of the reduction is done at the gemm/warp level; see gemm/warp/mma_with_reduction_tensor_op.h
|
Most of the reduction is done at the gemm/warp level; see gemm/warp/mma_with_reduction_tensor_op.h
|
||||||
A few bit of reduction is done in the epilouge before storing the vector, see
|
A small amount of reduction is done in the epilogue before storing the vector; see
|
||||||
epilogue/threadblock/epilogue_gemm_k_reduction.h
|
epilogue/threadblock/epilogue_gemm_k_reduction.h
|
||||||
*/
|
*/
|
||||||
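In equations, the computation sketched here (notation assumed; reducing operand B along k instead yields the 1xN variant described in the text):

```latex
D = \alpha \, A B + \beta \, C,
\qquad
w_i = \sum_{k=1}^{K} A_{ik}, \quad i = 1, \dots, M
```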
|
|
||||||
|
@ -1088,7 +1088,7 @@ int main(int argc, char const **args) {
|
|||||||
|
|
||||||
// Determine kernel configuration based on head size.
|
// Determine kernel configuration based on head size.
|
||||||
// If head size is less than or equal to 64, each block operates over 64 queries and
|
// If head size is less than or equal to 64, each block operates over 64 queries and
|
||||||
// 64 keys, and parital results can be stored in the register file.
|
// 64 keys, and partial results can be stored in the register file.
|
||||||
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
||||||
// and partial results are stored in shared memory.
|
// and partial results are stored in shared memory.
|
||||||
if (options.head_size_v > 64) {
|
if (options.head_size_v > 64) {
|
||||||
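The dispatch described in that comment boils down to a compile-time choice keyed on the run-time head size. A schematic sketch of just the branching logic; the function name and template parameters below are hypothetical, not the example's actual types:

```cpp
#include <cstdio>

// Hypothetical kernel stand-in: block shape (queries x keys) and where partial
// results live are fixed at compile time.
template <int kQueriesPerBlock, int kKeysPerBlock, bool kPartialsInRegisters>
void run_fused_attention() {
  std::printf("block: %d queries x %d keys, partials in %s\n",
              kQueriesPerBlock, kKeysPerBlock,
              kPartialsInRegisters ? "registers" : "shared memory");
}

void dispatch(int head_size_v) {
  if (head_size_v > 64) {
    // Larger heads: 32 queries x 128 keys; partials go to shared memory.
    run_fused_attention<32, 128, false>();
  } else {
    // Head size <= 64: 64 queries x 64 keys; partials fit in the register file.
    run_fused_attention<64, 64, true>();
  }
}

int main() {
  dispatch(48);
  dispatch(128);
  return 0;
}
```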
|
@ -1173,7 +1173,7 @@ int main(int argc, char const **args) {
|
|||||||
|
|
||||||
// Determine kernel configuration based on head size.
|
// Determine kernel configuration based on head size.
|
||||||
// If head size is less than or equal to 64, each block operates over 64 queries and
|
// If head size is less than or equal to 64, each block operates over 64 queries and
|
||||||
// 64 keys, and parital results can be stored in the register file.
|
// 64 keys, and partial results can be stored in the register file.
|
||||||
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
// If head size is greater than 64, each block operates over 32 queries and 128 keys,
|
||||||
// and partial results are stored in shared memory.
|
// and partial results are stored in shared memory.
|
||||||
if (options.head_size_v > 64) {
|
if (options.head_size_v > 64) {
|
||||||
|
@ -310,7 +310,7 @@ class CustomMmaPipelined : public CustomMmaBase<Shape_, Policy_, 2> {
|
|||||||
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
||||||
// issuing shared memory loads (which have the tighest latency requirement).
|
// issuing shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -600,7 +600,7 @@ class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory<
|
|||||||
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
iterator_B.clear_mask(gemm_k_iterations <= 1);
|
||||||
|
|
||||||
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
// Issue loads during the first warp-level matrix multiply-add *AFTER*
|
||||||
// issuing shared memory loads (which have the tighest latency requirement).
|
// issuing shared memory loads (which have the tightest latency requirement).
|
||||||
|
|
||||||
//
|
//
|
||||||
// Mainloop
|
// Mainloop
|
||||||
|
@ -181,7 +181,7 @@ class PredicatedTileAccessIteratorResidualLast<
|
|||||||
BytePointer pointer_;
|
BytePointer pointer_;
|
||||||
|
|
||||||
/// Below is used when Gather is turned on. We need to record strided_offset
|
/// Below is used when Gather is turned on. We need to record strided_offset
|
||||||
/// and contiguous_offset seperated to compute the offset by using
|
/// and contiguous_offset separately to compute the offset by using
|
||||||
///
|
///
|
||||||
/// offset = contiguous_offset + indices[strided_offset]
|
/// offset = contiguous_offset + indices[strided_offset]
|
||||||
///
|
///
|
||||||
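A small standalone sketch of the offset computation spelled out in that comment, exactly as written there (any element-size or stride scaling is omitted here, as it is in the comment; the function name is an illustration, not a CUTLASS API):

```cpp
#include <cstdint>

// Gather addressing as in the comment above: the strided coordinate indexes
// into a user-provided indices array; the contiguous coordinate is used as-is.
inline int64_t gather_offset(int64_t contiguous_offset,
                             int64_t strided_offset,
                             int const *indices) {
  // offset = contiguous_offset + indices[strided_offset]
  return contiguous_offset + indices[strided_offset];
}
```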
|
@ -86,14 +86,14 @@ class gen_default_b2b_mma:
|
|||||||
"OperatorClass", str(stage), "Operator")
|
"OperatorClass", str(stage), "Operator")
|
||||||
return gen_code
|
return gen_code
|
||||||
|
|
||||||
def gen_using_FusedAddBiasEpilouge(self):
|
def gen_using_FusedAddBiasEpilogue(self):
|
||||||
gen_code = ""
|
gen_code = ""
|
||||||
for i in range(self.b2b_num - 1):
|
for i in range(self.b2b_num - 1):
|
||||||
code_using = helper.var_idx("using FusedAddBiasEpilouge", i)
|
code_using = helper.var_idx("using FusedAddBiasEpilogue", i)
|
||||||
epilouge_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp"
|
epilogue_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp"
|
||||||
template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue"
|
template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue"
|
||||||
|
|
||||||
gen_code += code_using + " = " + epilouge_name + template_args + ";\n"
|
gen_code += code_using + " = " + epilogue_name + template_args + ";\n"
|
||||||
|
|
||||||
return gen_code
|
return gen_code
|
||||||
|
|
||||||
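For orientation, the C++ that `gen_using_FusedAddBiasEpilogue` emits for stage i = 0 reconstructs, from the string concatenation above, to roughly the following. It is not standalone code: `ThreadblockShape0`, `MmaCore0`, and `EpilogueOutputOp0` are defined elsewhere in the generated file.

```cpp
// Reconstructed generator output for i = 0 (illustrative, not verbatim):
using FusedAddBiasEpilogue0 =
    typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp<
        ThreadblockShape0,
        typename MmaCore0::MmaPolicy::Operator,
        1,
        EpilogueOutputOp0,
        2>::Epilogue;
```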
@ -161,12 +161,12 @@ class gen_default_b2b_mma:
|
|||||||
MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, "
|
MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, "
|
||||||
|
|
||||||
for i in range(self.b2b_num - 1):
|
for i in range(self.b2b_num - 1):
|
||||||
epilouge_name = "EpilogueOutputOp" + str(i)
|
epilogue_name = "EpilogueOutputOp" + str(i)
|
||||||
MmaPipelined_param_list += epilouge_name + ", "
|
MmaPipelined_param_list += epilogue_name + ", "
|
||||||
|
|
||||||
for i in range(self.b2b_num - 1):
|
for i in range(self.b2b_num - 1):
|
||||||
epilouge_name = "FusedAddBiasEpilouge" + str(i)
|
epilogue_name = "FusedAddBiasEpilogue" + str(i)
|
||||||
MmaPipelined_param_list += epilouge_name + ", "
|
MmaPipelined_param_list += epilogue_name + ", "
|
||||||
|
|
||||||
for i in range(self.b2b_num):
|
for i in range(self.b2b_num):
|
||||||
MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy"
|
MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy"
|
||||||
@@ -198,7 +198,7 @@ class gen_default_b2b_mma:
 mmacore_codebody = self.gen_using_MmaCore(2)
 iterator_codebody = self.gen_using_Iterator()
 fragment_iterator_codebody = self.gen_fragment_iterator()
-epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilouge()
+epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilogue()
 threadBlockMma = self.gen_threadblockmma()
 specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma

@@ -352,7 +352,7 @@ class gen_b2b_mme_pipelined:
 }\n\
 \n\
 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-// shared memory loads (which have the tighest latency requirement).\n\
+// shared memory loads (which have the tightest latency requirement).\n\
 \n\
 //\n\
 // Mainloop\n\
@@ -459,7 +459,7 @@ class gen_b2b_mme_pipelined:
 }\n\
 \n\
 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-// shared memory loads (which have the tighest latency requirement).\n\
+// shared memory loads (which have the tightest latency requirement).\n\
 iterator_A.load(tb_frag_A);\n\
 \n\
 //\n\
@@ -490,7 +490,7 @@ class gen_b2b_mme_pipelined:
 __syncthreads();\n\
 \n\
 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-// shared memory loads (which have the tighest latency requirement).\n\
+// shared memory loads (which have the tightest latency requirement).\n\
 iterator_A.load(tb_frag_A);\n\
 \n\
 ++this->smem_iterator_B0_;\n\
@@ -549,12 +549,12 @@ class gen_b2b_mme_pipelined:
 code = "// " + str(id + 1) + " Gemm"
 code += " /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n"

-code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilouge_accu", id - 1) + ";\n"
+code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilogue_accu", id - 1) + ";\n"
 code += " " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \
-+ helper.var_idx(", after_epilouge_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n"
++ helper.var_idx(", after_epilogue_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n"

 # FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
-code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilouge_accu", id - 1) + ");\n"
+code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilogue_accu", id - 1) + ");\n"
 # FragmentB1 tb_frag_B1;
 code += " " + helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n"
 # tb_frag_B1.clear();
@@ -990,7 +990,7 @@ class gen_threadblock:


 self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-self.gen_b2b_mma_piplined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
+self.gen_b2b_mma_pipelined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
 self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)


@@ -1001,7 +1001,7 @@ class gen_threadblock:

 with open(self.file_dir + "b2b_mma_base.h", "w+") as f:
 f.write(base_code)
-pipeline_code = self.gen_b2b_mma_piplined.gen_code(first_use_1stage = first_use_1stage)
+pipeline_code = self.gen_b2b_mma_pipelined.gen_code(first_use_1stage = first_use_1stage)
 print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir)

 with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f:
@@ -45,7 +45,7 @@ class gen_verify:
 self.user_header_file = ""
 for header in user_header_file:
 self.user_header_file += "#include \"" + header + "\"\n"
-self.seperate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
+self.separate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
 self.gen_params()
 self.output_dir = output_dir

@@ -53,14 +53,14 @@ class gen_verify:
 def gen_code(self):
 code = ""
 code += self.user_header_file
-code += self.seperate_cutlass.gen_using(False) #False -> Turing, True -> Volta
+code += self.separate_cutlass.gen_using(False) #False -> Turing, True -> Volta

 code_body = ""
 for i in range(self.b2b_num):
 code_body += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n"
 code_body += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n"

-code_body += self.seperate_cutlass.gen_run()
+code_body += self.separate_cutlass.gen_run()

 code += ir.gen_func(self.name, self.params, code_body)
 helper.write_2_headfile("cutlass_verify.h", self.output_dir, code)
@@ -87,6 +87,6 @@ class gen_verify:

 def gen_initialize():
 code = ""
-initialize_code = self.seperate_cutlass.gen_initialize()
+initialize_code = self.separate_cutlass.gen_initialize()

 code = ir.gen_func("initialize", [[]])
@@ -83,23 +83,23 @@ def list_2_string(input_list, ):
 return rtn_string


-def get_epilouge_info(layer_info):
+def get_epilogue_info(layer_info):
 return layer_info['epilogue']

 def get_epilogue_tp(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['tp']

 def get_epilogue_add_bias_or_not(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['bias']['addbias']

 def get_epilogue_add_bias_tp(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['bias']['bias_tp']

 def get_epilogue_args(layer_info):
-epilogue_info = get_epilouge_info(layer_info)
+epilogue_info = get_epilogue_info(layer_info)
 return epilogue_info['args']

 def get_epilogue_bias_shape(layer_info):
@@ -33,7 +33,7 @@
 \brief Hopper GEMM example leveraging collective operation builders.

 This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels
-targetting the NVIDIA Hopper architecture.
+targeting the NVIDIA Hopper architecture.

 Background and motivation
 -------------------------
@@ -45,7 +45,7 @@
 However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed
 in CUTLASS 3:
 (1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without
-specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targetting
+specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targeting
 Ampere specify that three pipeline stages should be used regardless of the sizes of operands.
 If one wished to increase this value, one would also need to specify all other template parameters.
 This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface.
@@ -55,7 +55,7 @@

 Alongside these opportunities for improvement, the Hopper architecture offers new features that increase
 the number of valid configurations of a kernel. In addition to the many template parameters already available
-in CUTLASS 2 kernels, CUTLASS 3 kernels targetting Hopper also have various scheduling modes to select from that control:
+in CUTLASS 2 kernels, CUTLASS 3 kernels targeting Hopper also have various scheduling modes to select from that control:
 (1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async)
 (2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization")
 (3) whether persistent thread blocks should be used
@@ -64,13 +64,13 @@
 Introduction to the CollectiveBuilder
 -------------------------------------
 CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters
-for kernels targetting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder
+for kernels targeting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder
 takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically
 determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided
 parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template
 parameters), the CollectiveBuilder will also automatically select these.

-Unlike DefaultGemmConfigurations a parital specialization of the CollectiveBuilder is not needed for many
+Unlike DefaultGemmConfigurations a partial specialization of the CollectiveBuilder is not needed for many
 configurations of operand types. Instead the CollectiveBuilder "builds" a configuration based on generic
 properties of the specified operands, layouts, and other parameters. For example, when the stage count
 is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that
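As context for the docstring edited above, this is roughly what a CollectiveBuilder instantiation with `Auto` schedule and stage-count parameters looks like. It is a sketch only: the header path and the exact template-parameter order are recalled from CUTLASS 3's example 49 and should be verified against the headers, not treated as the definitive signature.

```cpp
// Sketch: mainloop built by the CollectiveBuilder with Auto policies.
// Assumed header path; verify against your CUTLASS 3 checkout.
#include "cutlass/gemm/collective/collective_builder.hpp"

using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cutlass::half_t, cutlass::layout::ColumnMajor, 8,  // A: element, layout, alignment
    cutlass::half_t, cutlass::layout::ColumnMajor, 8,  // B: element, layout, alignment
    float,                                             // accumulator element
    cute::Shape<cute::_128, cute::_128, cute::_64>,    // threadblock tile (M, N, K)
    cute::Shape<cute::_1, cute::_1, cute::_1>,         // cluster shape
    cutlass::gemm::collective::StageCountAuto,         // builder picks the stage count
    cutlass::gemm::collective::KernelScheduleAuto      // builder picks the schedule
  >::CollectiveOp;
```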
@@ -90,7 +90,7 @@
 Details of this example
 -----------------------
 This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.
-This example also illustrates how CUTLASS 3 GEMMs targetting Hopper automatically support batched GEMMs by simply
+This example also illustrates how CUTLASS 3 GEMMs targeting Hopper automatically support batched GEMMs by simply
 extending the problem size with an additional tensor rank.

 Example usage:
@@ -162,7 +162,7 @@ struct Options {

 out << "49_hopper_gemm_schedules_with_collective_builder\n\n"
 << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n"
-<< " performant kernels targetting NVIDIA's Hopper architecture.\n\n"
+<< " performant kernels targeting NVIDIA's Hopper architecture.\n\n"
 << "Options:\n\n"
 << " --help If specified, displays this usage statement\n\n"
 << " --m=<int> Sets the M extent of the GEMM\n"
@@ -718,7 +718,7 @@ make_tma_copy(CopyOp,
 << "\nswizzle " << smem_swizzle
 << "\nl2Promotion " << tma_l2Promotion
 << "\noobFill " << tma_oobFill << std::endl;
-std::cerr << "Error: Failed to intialize the TMA descriptor " << result << std::endl;
+std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl;
 assert(false);
 }
 #endif // (__CUDACC_VER_MAJOR__ >= 12)
@@ -98,11 +98,11 @@ struct OpClassSimt {};

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Tag classifing operators as Tensor Core operations.
+/// Tag classifying operators as Tensor Core operations.
 struct OpClassTensorOp {};

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Tag classifing operators as WMMA Tensor Core operations
+/// Tag classifying operators as WMMA Tensor Core operations
 struct OpClassWmmaTensorOp {};

 /////////////////////////////////////////////////////////////////////////////////////////////////
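The tags in this hunk are empty types used purely for compile-time dispatch. A minimal standalone sketch of that pattern (the `KernelName` trait is hypothetical, not a CUTLASS API; local stand-ins mirror the tag structs shown above, which in CUTLASS are assumed to live in the `cutlass::arch` namespace):

```cpp
#include <cstdio>

// Local stand-ins mirroring the tag types in the hunk above.
struct OpClassSimt {};
struct OpClassTensorOp {};
struct OpClassWmmaTensorOp {};

// Hypothetical trait: an empty tag type selects an implementation at compile time.
template <typename OperatorClass> struct KernelName;
template <> struct KernelName<OpClassSimt>         { static constexpr const char* value = "CUDA-core (SIMT)"; };
template <> struct KernelName<OpClassTensorOp>     { static constexpr const char* value = "Tensor Core (mma)"; };
template <> struct KernelName<OpClassWmmaTensorOp> { static constexpr const char* value = "WMMA Tensor Core"; };

int main() {
  std::printf("%s\n", KernelName<OpClassTensorOp>::value);  // prints: Tensor Core (mma)
  return 0;
}
```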
@@ -230,7 +230,7 @@ public:
 offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
 offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;

-// Intialize pointers for gemm_k=0
+// Initialize pointers for gemm_k=0
 TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_};

 pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
@@ -341,7 +341,7 @@ public:

 next_idx = 1;

-// Restore bytes in q coordinate (Mma in filter s dimenstion)
+// Restore bytes in q coordinate (Mma in filter s dimension)
 reset_bytes = reset_bytes_s_;

 } else {
@@ -351,7 +351,7 @@ public:

 next_idx = 2;

-// Restore bytes in p and q coordinate (Mma in filter s and r dimenstion)
+// Restore bytes in p and q coordinate (Mma in filter s and r dimension)
 reset_bytes = reset_bytes_r_;
 }
 #else
@@ -195,7 +195,7 @@ public:
 s = filter_s_[iteration_contiguous_];
 }
 else {
-/// Multiple access to support non-128b alignment in contiguous dimenstion
+/// Multiple access to support non-128b alignment in contiguous dimension
 c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C;
 int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C;
 s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S;
@@ -212,7 +212,7 @@ public:

 if (kAccessesPerVector > 1) {
 // This code section is only to support non-128b alignment
-// Multiple access to support non-128b alignment in contiguous dimenstion
+// Multiple access to support non-128b alignment in contiguous dimension
 int wrap_c;
 params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements);

@@ -241,7 +241,7 @@ public:
 int rs_plane_idx = 0;

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -238,7 +238,7 @@ public:
 int smem_write_stage_idx = 1;

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -67,7 +67,7 @@ static int get_strided_dgrad_tile_m(
 // CUTLASS strided dgrad performance for stride > filter, i.e., stride={2x2} and filter={1x1})
 //
 // * Optimization *
-// Only launch CTAs in M dimenstion which contribute to a row in Dx output
+// Only launch CTAs in M dimension which contribute to a row in Dx output
 //
 //
 // * Constraints *
@@ -107,7 +107,7 @@ struct StridedDgradHorizontalThreadblockSwizzle :
 // compute number of tiles in m dimension
 int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());

-// compute number of tiles in n dimenstion
+// compute number of tiles in n dimension
 int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();

 return gemm::GemmCoord(
@@ -148,7 +148,7 @@ struct StridedDgradIdentityThreadblockSwizzle :
 // compute number of tiles in m dimension
 int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());

-// compute number of tiles in n dimenstion
+// compute number of tiles in n dimension
 int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();

 return gemm::GemmCoord(
@@ -77,7 +77,7 @@ namespace threadblock {
 // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
 /////////////////////////////////////////////////////////////////////////////////////////////////
 template <
-/// Epilouge Shape
+/// Epilogue Shape
 typename Shape_,
 /// Warp-level mma operator
 typename WarpMmaTensorOp_,
@@ -78,7 +78,7 @@ namespace threadblock {
 // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
 /////////////////////////////////////////////////////////////////////////////////////////////////
 template <
-/// Epilouge Shape
+/// Epilogue Shape
 typename Shape_,
 /// Warp-level mma operator
 typename WarpMmaTensorOp_,
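The comment repeated in the two hunks above encodes the split-complex product these epilogues assume. A small self-contained sketch of that arithmetic (plain C++, independent of CUTLASS):

```cpp
#include <cstdio>

// Split-complex multiply: d = a * b with
//   dr = ar*br - ai*bi  and  di = ar*bi + ai*br,
// exactly as in the comment above.
struct Complexf { float r, i; };

Complexf cmul(Complexf a, Complexf b) {
  return { a.r * b.r - a.i * b.i,    // real part
           a.r * b.i + a.i * b.r };  // imaginary part
}

int main() {
  Complexf d = cmul({1.0f, 2.0f}, {3.0f, -1.0f});
  std::printf("d = %f + j %f\n", d.r, d.i);  // expected: 5 + j 5
  return 0;
}
```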
@@ -198,7 +198,7 @@ private:
 /// A thread's starting column
 Index thread_start_column_;

-/// Initial thread ouput location
+/// Initial thread output location
 int thread_start_n_, thread_start_p_, thread_start_q_;

 /// Current threadblock tile index
@@ -186,10 +186,10 @@ private:
 /// Extent of the matrix tile in rows
 Index extent_row_;

-/// Starting Dx h and w dimenstion for strided dgrad mapping
+/// Starting Dx h and w dimension for strided dgrad mapping
 int start_h_, start_w_;

-/// Effective Dy P and Q dimenstions for strided dgrad mapping
+/// Effective Dy P and Q dimensions for strided dgrad mapping
 int p_, q_;

 /// A thread's starting row position (assuming steady-state predicates have been computed)
@@ -547,7 +547,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -521,7 +521,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -476,7 +476,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -454,7 +454,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -475,7 +475,7 @@ public:

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -194,7 +194,7 @@ class GemmLayernormMainloopFusion :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -219,7 +219,7 @@ class GemmUniversal :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -198,7 +198,7 @@ class GemmUniversalWithBroadcast :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -211,7 +211,7 @@ class GemmWithKReduction :

 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -348,7 +348,7 @@ public:
 };
 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchange operand.
+/// Partial specialization for column-major output exchange operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -325,7 +325,7 @@ public:
 };
 ////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization for column-major output exchange operand.
+/// Partial specialization for column-major output exchange operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -408,7 +408,7 @@ public:
 call GEMM mainloop for with RowMajor efficient-epilogue
 ********************************************************************************************************/

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -563,7 +563,7 @@ For the mainloop and trmm kernel, `A` and `B` points to left-side and right-side
 call GEMM mainloop for with RowMajor efficient-epilogue
 ********************************************************************************************************/

-/// Parital specialization for column-major output exchanges problem size and operand.
+/// Partial specialization for column-major output exchanges problem size and operand.
 template <
 /// Element type for A matrix operand
 typename ElementA_,
@@ -137,7 +137,7 @@ struct DefaultGemmWithBroadcast {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization: ArchTag = cutlass::arch::Sm70
+/// Partial specialization: ArchTag = cutlass::arch::Sm70
 ///
 ///
 template <
@@ -138,7 +138,7 @@ struct DefaultGemmWithReduction {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Parital specialization: ArchTag = cutlass::arch::Sm70
+/// Partial specialization: ArchTag = cutlass::arch::Sm70
 ///
 ///
 template <
@@ -138,7 +138,7 @@
 i = i_macro
 j = j_macro

-Handling cases with grid dimensions that aren't multiples of eachother
+Handling cases with grid dimensions that aren't multiples of each other
 ----------------------------------------------------------------------
 Even though threadblock shapes M and N are typically multiples of one another, the grid
 for a given problem may not have dimensions of the same ratio as that of the threadblock.
@@ -196,7 +196,7 @@ public:
 // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
 #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
 if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n");
+printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
 return;
 }
 #endif
@@ -186,7 +186,7 @@ public:
 // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
 #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
 if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n");
+printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
 return;
 }
 #endif
@@ -258,7 +258,7 @@ public:
 // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
 #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
 if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n");
+printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
 return;
 }
 #endif
@@ -271,7 +271,7 @@ public:
 }

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -321,7 +321,7 @@ public:
 iterator_B_imag.clear_mask(gemm_k_iterations <= 1);

 // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
-// shared memory loads (which have the tighest latency requirement).
+// shared memory loads (which have the tightest latency requirement).

 //
 // Mainloop
@@ -83,7 +83,7 @@ struct TensorReductionAffineContiguousParams {
 uint64_t outer_count; /// Number of elements in outer index space

 ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
-ElementSource const * source; /// Poitner to source pointer of rank kRank
+ElementSource const * source; /// Pointer to source pointer of rank kRank
 ReductionOp reduction_op; /// Reduction operator
 ElementCompute reduction_identity; /// Identity element used by reduction operator
 ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
@@ -85,7 +85,7 @@ struct TensorReductionAffineStridedParams {
 uint64_t outer_count; /// Number of elements in outer index space

 ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
-ElementSource const * source; /// Poitner to source pointer of rank kRank
+ElementSource const * source; /// Pointer to source pointer of rank kRank
 ReductionOp reduction_op; /// Reduction operator
 ElementCompute reduction_identity; /// Identity element for reduction operator
 ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
@@ -399,7 +399,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
 bool is_residue_tile_;

 /// Below is used when Gather is turned on. We need to record strided_offset
-/// and contiguous_offset seperated to compute the offset by using
+/// and contiguous_offset separated to compute the offset by using
 ///
 /// offset = contiguous_offset + indices[strided_offset]
 ///
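A tiny standalone sketch of the gather addressing rule quoted in the comment above (illustrative only; in the real iterator the offsets are maintained in bytes with strides already folded into the index array, and the names below are hypothetical):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Gather addressing: the strided coordinate is indirected through an index
// array while the contiguous coordinate is used directly, i.e.
//   offset = contiguous_offset + indices[strided_offset]
std::int64_t gather_offset(std::int64_t contiguous_offset,
                           std::int64_t strided_offset,
                           std::vector<std::int64_t> const& indices) {
  return contiguous_offset + indices[strided_offset];
}

int main() {
  std::vector<std::int64_t> indices = {0, 40, 10, 30};  // gathered row offsets
  // Element at contiguous position 3 within the second gathered row (offset 40):
  std::printf("%lld\n", static_cast<long long>(gather_offset(3, 1, indices)));  // 43
  return 0;
}
```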
@@ -1079,7 +1079,7 @@ class RegularTileIterator<
 //

 /// The crosswised elements will be stored in a line.
-/// line_size is size of crosswised dimention plus padding.
+/// line_size is size of crosswised dimension plus padding.
 /// in units of AccessType
 Index line_size;

@@ -347,7 +347,7 @@ creating GEMM-B tile in shared memory.
 The improvements covered by optimized iterators are:
 - (a) Precomputing kernel-invariant pointer deltas on the host
 - (b) Computing cta-invariant mask predicates on device-side iterator ctors
-- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimenstions to convolution tensors.
+- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimensions to convolution tensors.
 For example, _optimized_ activation iterator uses fast divmod to map GEMM _M_ to NPQ
 for activation iterator
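To make the NPQ mapping in item (c) concrete, here is a sketch of decomposing a linear GEMM _M_ index into activation coordinates (n, p, q) with ordinary integer divmod; CUTLASS's fast_math.h performs the same decomposition but replaces `/` and `%` with precomputed multiply-shift "fast divmod", which is the part this sketch deliberately omits for clarity.

```cpp
#include <cstdio>

// Decompose a linear GEMM M index into activation coordinates (n, p, q),
// where the output tensor has extents N x P x Q.
void m_to_npq(int m, int P, int Q, int& n, int& p, int& q) {
  n = m / (P * Q);
  int residual = m % (P * Q);
  p = residual / Q;
  q = residual % Q;
}

int main() {
  int n, p, q;
  m_to_npq(/*m=*/123, /*P=*/7, /*Q=*/5, n, p, q);
  std::printf("n=%d p=%d q=%d\n", n, p, q);  // 123 = 3*35 + 3*5 + 3 -> (3, 3, 3)
  return 0;
}
```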
@@ -587,7 +587,8 @@ To instantiate all operations supporting all tile sizes, data types, and alignme
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=all
 ```
-The above command line generates about twenty thousand kernels targetting NVIDIA Ampere, Turing, and Volta architectures.
+
+The above command line generates about twenty thousand kernels targeting NVIDIA Ampere, Turing, and Volta architectures.
 Compiling thousands of kernels for three different architectures is time consuming. Additionaly, this would also result
 in a large binary size and on some platforms linker to fail on building the library.

@@ -641,13 +642,13 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop,s1681
 $ cmake .. -DCUTLASS_NVCC_ARCHS='50;60;61;70;75;80' -DCUTLASS_LIBRARY_KERNELS=sfprop
 ```

-**Example.** All forward propagation (fprop) convolution kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere's 16816 Tensor Core operation
+**Example.** All forward propagation (fprop) convolution kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere's 16816 Tensor Core operation
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='80' -DCUTLASS_LIBRARY_KERNELS=s16816fprop_*_f16
 ```

 **Example.** All backward weight gradient (wgrad) convolution kernels with FP32 accumulation, FP16 input, and optimized global memory iterator
-targetting NVIDIA Ampere, Turing, and Volta Tensor Core operations
+targeting NVIDIA Ampere, Turing, and Volta Tensor Core operations
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*wgrad_optimized_f16
 ```
@@ -573,7 +573,7 @@ bool TestSpecificConv2d(
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm>
@@ -517,7 +517,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm, int InterleavedK>
@@ -502,7 +502,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm,
@@ -464,7 +464,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <typename ImplicitGemm>
@@ -522,7 +522,7 @@ public:
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
-// Additionaly, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
+// Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
 // (conv_blacklist_sizes)
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -638,7 +638,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B
 >;

-// Epilouge
+// Epilogue
 using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
 TagToStrideC_t<LayoutC>,
 TagToStrideC_t<LayoutC>,
@@ -321,13 +321,13 @@ public:
 NumericTypeID element_C, /// Data type of C and D matrix

 void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices
-void const * const * ptr_C_imag, /// Pointer to array containing poitners to imaginary part of C matrices
+void const * const * ptr_C_imag, /// Pointer to array containing pointers to imaginary part of C matrices

 int64_t ldc_real, /// Leading dimension of real part of C matrix
 int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix

 void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices
-void * const * ptr_D_imag, /// Pointer to array containing poitners to imaginary part of D matrices
+void * const * ptr_D_imag, /// Pointer to array containing pointers to imaginary part of D matrices

 int64_t ldd_real, /// Leading dimension of real part of D matrix
 int64_t ldd_imag /// Leading dimension of imaginary part of D matrix
@@ -518,7 +518,7 @@ struct GemmDescription : public OperationDescription {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-/// Desciprion for structured sparse GEMMs.
+/// Description for structured sparse GEMMs.
 struct SparseGemmDescription : public GemmDescription {

 /// Description structure for structured sparse GEMM
@@ -1160,7 +1160,7 @@ struct GemmGroupedArguments {
 // OperationKind: kSparseGemm
 //

-/// Computes GEMM assumine one of the inputs has 2:4 structured sparsity.
+/// Computes GEMM assuming one of the inputs has 2:4 structured sparsity.
 struct SparseGemmConfiguration {

 GemmUniversalMode mode;
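For readers unfamiliar with the term in this hunk: 2:4 structured sparsity means every aligned group of four consecutive elements contains at most two nonzeros. A standalone checker sketch of that property (illustrative only, not part of CUTLASS):

```cpp
#include <cstddef>
#include <cstdio>

// Returns true if every aligned group of 4 elements has at most 2 nonzeros,
// i.e. the array satisfies the 2:4 structured-sparsity pattern.
bool is_2_4_sparse(float const* data, std::size_t n) {
  for (std::size_t g = 0; g + 4 <= n; g += 4) {
    int nonzeros = 0;
    for (int j = 0; j < 4; ++j) {
      nonzeros += (data[g + j] != 0.0f);
    }
    if (nonzeros > 2) {
      return false;
    }
  }
  return true;
}

int main() {
  float ok[8]  = {1, 0, 2, 0,  0, 3, 0, 4};  // 2 nonzeros per group of 4
  float bad[4] = {1, 2, 3, 0};               // 3 nonzeros in one group
  std::printf("%d %d\n", is_2_4_sparse(ok, 8), is_2_4_sparse(bad, 4));  // 1 0
  return 0;
}
```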
@@ -1187,7 +1187,7 @@ struct SparseGemmArguments {
 void const *B; /// pointer to B matrix
 void const *C; /// pointer to C matrix
 void *D; /// pointer to D matrix
-void const *E; /// pointer to E matric (metadata)
+void const *E; /// pointer to E matrix (metadata)

 void const *alpha; /// pointer to alpha scalar
 void const *beta; /// pointer to beta scalar
@@ -1465,7 +1465,7 @@ struct ConvArguments {
 /// pointer to implicit gemm matrix C
 void const *C;

-/// pointer to implicit gemm desitination matrix D
+/// pointer to implicit gemm destination matrix D
 void *D;

 /// Host or device pointer to alpha scalar
@@ -1487,16 +1487,16 @@ struct ConvArguments {
 //
 struct ReductionConfiguration {

-/// Redcution problem size
+/// Reduction problem size
 MatrixCoord problem_size;

 /// Number of partitions to reduce
 int partitions;

-/// Number of lements between each partition
+/// Number of elements between each partition
 int64_t partition_stride;

-/// leading dimension of 'w'orksace operand
+/// leading dimension of 'w'orkspace operand
 int64_t ldw;

 /// leading dimension of 's'ource operand