From 557be3ab0e3de8fab300f1d10b8e09e3e732eefc Mon Sep 17 00:00:00 2001 From: wang-y-z <57429717+wang-y-z@users.noreply.github.com> Date: Fri, 3 Nov 2023 11:54:46 +0800 Subject: [PATCH] Fix several typos (#1169) Co-authored-by: isaacw --- docs/namespacecutlass_1_1reference_1_1host.html | 2 +- .../44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py | 8 ++++---- .../44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py | 2 +- .../ir_gen/gen_threadblock.py | 10 +++++----- examples/python/04_epilogue_visitor.ipynb | 2 +- include/cute/algorithm/tensor_algorithms.hpp | 2 +- include/cute/layout.hpp | 2 +- .../sm90_gemm_tma_warpspecialized_cooperative.hpp | 2 +- .../kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp | 2 +- .../gemm/kernel/sm90_tile_scheduler_stream_k.hpp | 2 +- .../transform/collective/sm90_wgmma_transpose.hpp | 2 +- include/cutlass/wmma_array.h | 2 +- media/docs/cute/04_algorithms.md | 2 +- media/docs/cute/0t_mma_atom.md | 2 +- media/docs/cute/0x_gemm_tutorial.md | 2 +- media/docs/quickstart.md | 2 +- test/unit/conv/device/conv2d_testbed.h | 6 +++--- test/unit/conv/device/conv2d_testbed_interleaved.h | 2 +- test/unit/conv/device/conv2d_with_broadcast_testbed.h | 2 +- test/unit/conv/device/conv2d_with_reduction_testbed.h | 2 +- test/unit/conv/device/conv3d_testbed.h | 2 +- 21 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/namespacecutlass_1_1reference_1_1host.html b/docs/namespacecutlass_1_1reference_1_1host.html index 6d07d1f2..194bdfe8 100644 --- a/docs/namespacecutlass_1_1reference_1_1host.html +++ b/docs/namespacecutlass_1_1reference_1_1host.html @@ -1677,7 +1677,7 @@ template<typename Element , typename Layout >
-

Returns a pair containing a boolean of whether a value exists in a tensor and the location of of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined.

+

Returns a pair containing a boolean of whether a value exists in a tensor and the location of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined.

diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py index c6df88cc..f8899e25 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py @@ -49,7 +49,7 @@ class gen_device: self.arg_member = [] self.gen_class_name = gen_class_name self.gen_kernel_name = gen_class_name + "Kernel" - self.tempalte_args = [] + self.template_args = [] self.__tempalate_arg_list = {'Stages': int, 'SplitKSerial': bool, 'IsBetaZero': bool, 'AlignmentA': int, 'AlignmentB': int} self.file_name = output_dir + "/device/" +gen_class_name +".h" @@ -63,7 +63,7 @@ class gen_device: self.first_use_1stage = False ## gen kernel - self.gen_kernel = gen_ker.gen_kernel(self.tempalte_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root) + self.gen_kernel = gen_ker.gen_kernel(self.template_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root) def __check_arg_type(self, temp_arg): @@ -126,7 +126,7 @@ class gen_device: func_code = self.gen_all_func() member_var_code = "private:\n typename B2bGemmKernel::Params params_;\n" - gen_code = gen_ir.gen_template_class(self.gen_class_name, self.tempalte_args, func_code + member_var_code) + gen_code = gen_ir.gen_template_class(self.gen_class_name, self.template_args, func_code + member_var_code) code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("device", gen_code))) if ifprint: @@ -142,7 +142,7 @@ class gen_device: def update_b2b_class_template_args(self): for arg in self.args.keys(): - self.tempalte_args.append([self.__check_arg_type(arg), arg, self.args[arg]]) + self.template_args.append([self.__check_arg_type(arg), arg, self.args[arg]]) def update_b2b_args(self): diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py 
b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py index a640fc60..0143e6b4 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py @@ -444,7 +444,7 @@ class gen_kernel: self.gen_class_name = "B2bGemm" self.gen_kernel_name = gen_class_name + "Kernel" - self.tempalte_args = [] + self.template_args = [] self.cutlass_deps_root = cutlass_deps_root self.project_root = project_root diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py index bb3c76fc..36d1303c 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py @@ -957,13 +957,13 @@ public:\n\ def gen_code(self): - tempalte_arg = [] + template_arg = [] for i in range(self.b2b_num): - tempalte_arg.append(("typename", helper.var_idx("Shape", i))) + template_arg.append(("typename", helper.var_idx("Shape", i))) for i in range(self.b2b_num): - tempalte_arg.append(("typename", helper.var_idx("Policy", i))) + template_arg.append(("typename", helper.var_idx("Policy", i))) for i in range(self.b2b_num): - tempalte_arg.append((int, helper.var_idx("Stage", i))) + template_arg.append((int, helper.var_idx("Stage", i))) @@ -971,7 +971,7 @@ public:\n\ code_body += self.gen_protected() code_body += self.gen_public_member() - class_code = gen_ir.gen_template_class("B2bMmaBase", tempalte_arg, code_body) + class_code = gen_ir.gen_template_class("B2bMmaBase", template_arg, code_body) code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code))) diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/04_epilogue_visitor.ipynb index 3f47afa0..69a32226 100644 --- a/examples/python/04_epilogue_visitor.ipynb +++ b/examples/python/04_epilogue_visitor.ipynb @@ -68,7 +68,7 @@ 
"source": [ "## Define the epilogue visitor functor\n", "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmatic operations and built-in callable functions like `relu`. It also accomodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n", - "* Each named variable must be assigned exactly once and defined before it it used.\n", + "* Each named variable must be assigned exactly once and defined before it used.\n", "* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n", "* Return values must be a named variable.\n", "\n", diff --git a/include/cute/algorithm/tensor_algorithms.hpp b/include/cute/algorithm/tensor_algorithms.hpp index 294374b8..e260f8b0 100644 --- a/include/cute/algorithm/tensor_algorithms.hpp +++ b/include/cute/algorithm/tensor_algorithms.hpp @@ -123,7 +123,7 @@ transform(Tensor&& tensor_in, Tensor&& t // Similar to std::transform with a binary operation // Takes two tensors as input and one tensor as output. -// Applies the binary_op to tensor_in1 and and tensor_in2 and +// Applies the binary_op to tensor_in1 and tensor_in2 and // assigns it to tensor_out template const& layout) // Return the codomain shape of a mode // @post size(coshape(@a a)) == cosize(@a a) -// @return C Coordinate with smallest elements such that that +// @return C Coordinate with smallest elements such that // @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout) // where sub_layout = get(layout). 
template diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp index ca1ce1a4..61e94e92 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp @@ -527,7 +527,7 @@ public: auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape); - // Allocate the the accumulators for the (M,N) blk_shape + // Allocate the accumulators for the (M,N) blk_shape // // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead. auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp index 845ed861..ba483c2b 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp @@ -540,7 +540,7 @@ public: auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); - // Allocate the the accumulators for the (M,N) blk_shape + // Allocate the accumulators for the (M,N) blk_shape Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) // Order two Math WG's MMA one after the other, helps hide Epilogue diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp index ad333e1d..ec278da8 100644 --- a/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp +++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp @@ -347,7 +347,7 @@ public: // The number of tiles for which reduction is 
required is either: // (a) the total number of output tiles (in the case of split-K) // (b) the number of stream-K tiles - // To calculate the the total number of output tiles in the split-K case, we + // To calculate the total number of output tiles in the split-K case, we // note that, in the split-K case, the units_per_problem_ member of Params will be // the total number of output tiles. auto reduction_tiles = params.splits_ > 1 ? params.units_per_problem_ : params.sk_tiles_; diff --git a/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp b/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp index 6e41b7db..ac67da08 100644 --- a/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp +++ b/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp @@ -556,7 +556,7 @@ public: constexpr auto WarpThreadLayout = make_layout(make_shape(Int{}, Int{})); ////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize. - /// Divide a warp_group_tile into 8x8 warp_tiles to futher reduce the reg usage. + /// Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage. 
/// Step 0: Step 1: Step 2: Step 3: /// W0 W1 W2 W3 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- /// W1 W0 -- -- -- -- -- -- -- -- W3 W2 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h index 1366a43b..53652f55 100644 --- a/include/cutlass/wmma_array.h +++ b/include/cutlass/wmma_array.h @@ -47,7 +47,7 @@ namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Wmma array type (WmmaFragmentArray holds elements of of type nvcuda::wmma::fragment) +/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment) template < /// Element type typename T, diff --git a/media/docs/cute/04_algorithms.md b/media/docs/cute/04_algorithms.md index e35b7561..3a4f1e0b 100644 --- a/media/docs/cute/04_algorithms.md +++ b/media/docs/cute/04_algorithms.md @@ -116,7 +116,7 @@ would include the following. access instructions (like `cp.async`), then dispatch to the custom instruction. -2. The the two `Tensor`s have static layouts and it can be proven +2. The two `Tensor`s have static layouts and it can be proven that element vectorization is valid -- for example, four `LDS.32`s can be combined into a single `LDS.128` -- then vectorize the source and destinations tensors. diff --git a/media/docs/cute/0t_mma_atom.md b/media/docs/cute/0t_mma_atom.md index f1880464..d742851f 100644 --- a/media/docs/cute/0t_mma_atom.md +++ b/media/docs/cute/0t_mma_atom.md @@ -37,7 +37,7 @@ and the `Layout`s of threads and values within the operation. The `MMA_Traits` struct takes the Operation as a template parameter. CuTe specializes `MMA_Traits` for each Operation type that it supports. -Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of of the PTX instruction. 
The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates. +Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of the PTX instruction. The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates. CuTe MMA atoms expose the semantics of a single MMA operation. This is true regardless of the hardware level at which the MMA operates. diff --git a/media/docs/cute/0x_gemm_tutorial.md b/media/docs/cute/0x_gemm_tutorial.md index 17cccbf8..8a4f1c23 100644 --- a/media/docs/cute/0x_gemm_tutorial.md +++ b/media/docs/cute/0x_gemm_tutorial.md @@ -255,7 +255,7 @@ int bar() } ``` -"Static" is an unfortunately overloaded term in C++. Sometimes it means "the opposite of instance," like a "static function" or "static member" of a class. (Some programming languages, like Java, say "class method" to refer to a "static function of a class.") That's not what we mean here. Instead, we mean "part of a compile-time type." For example, `Int<1>` encodes the value 1 at compile time, as part of the type of a templated class `Int`. `Int<3>` and `Int<4>` have different types. You can get the value of of the type like this: `Int<3>::value`. (The `value` is a `static constexpr` member of the class, where "static" means "opposite of instance.") As soon as you go from `Int<3>` to `Int<3>::value`, you've gone from (3) above (a compile-time value) to (2) above (a `constexpr` value). In some situations, this may mean that the compiler treats it as a run-time value. +"Static" is an unfortunately overloaded term in C++. Sometimes it means "the opposite of instance," like a "static function" or "static member" of a class. (Some programming languages, like Java, say "class method" to refer to a "static function of a class.") That's not what we mean here. 
Instead, we mean "part of a compile-time type." For example, `Int<1>` encodes the value 1 at compile time, as part of the type of a templated class `Int`. `Int<3>` and `Int<4>` have different types. You can get the value of the type like this: `Int<3>::value`. (The `value` is a `static constexpr` member of the class, where "static" means "opposite of instance.") As soon as you go from `Int<3>` to `Int<3>::value`, you've gone from (3) above (a compile-time value) to (2) above (a `constexpr` value). In some situations, this may mean that the compiler treats it as a run-time value. #### Strides diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index c43882cc..45122bee 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -56,7 +56,7 @@ You may explicitly exclude cuBLAS and cuDNN as dependencies with the following C ## Build and run the CUTLASS Profiler -From the `build/` directory created above, compile the the CUTLASS Profiler. +From the `build/` directory created above, compile the CUTLASS Profiler. ```bash $ make cutlass_profiler -j12 ``` diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h index 47c3ca18..7054ce98 100644 --- a/test/unit/conv/device/conv2d_testbed.h +++ b/test/unit/conv/device/conv2d_testbed.h @@ -696,7 +696,7 @@ bool TestAllConv2d( return false; } - // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts + // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts if (CutlassUnitTestProblemCount() && testbed.tested_problem_count > CutlassUnitTestProblemCount()) { return true; @@ -742,7 +742,7 @@ bool TestAllConv2d( } // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely neccessary to catch functional bugs. 
The below code does provide option to sweep + // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) @@ -784,7 +784,7 @@ bool TestAllConv2d( return false; } - // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts + // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts if (CutlassUnitTestProblemCount() && testbed.tested_problem_count > CutlassUnitTestProblemCount()) { return true; diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index cc00d82b..3093525a 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -609,7 +609,7 @@ bool TestAllInterleavedConv2d( #if 0 // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. 
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h index 7bbe6745..dc2297fc 100644 --- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h +++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h @@ -632,7 +632,7 @@ bool TestAllConv2dWithBroadcast( // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h index c3d97998..7973870a 100644 --- a/test/unit/conv/device/conv2d_with_reduction_testbed.h +++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h @@ -587,7 +587,7 @@ bool TestAllConv2dWithReduction( // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. 
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index 64447c52..577c84f3 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -613,7 +613,7 @@ bool TestAllConv3d( // Sweep split-k-slice using serial reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size ( {1, 8, 8, 8, 32}, // input size (NDHWC)