From 5e1a0a5adbcda7c2ea7fb66d9ea22b89f7840ba7 Mon Sep 17 00:00:00 2001
From: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
Date: Tue, 17 Oct 2023 15:15:39 -0400
Subject: [PATCH] fix alignmentC for h16816_s8xf16 (#1146)

* fix alignmentC for h16816_s8xf16

* manish's change
---
 python/cutlass_library/generator.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py
index 365ea6cf..099c4271 100644
--- a/python/cutlass_library/generator.py
+++ b/python/cutlass_library/generator.py
@@ -2225,7 +2225,7 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
       math_inst.element_accumulator,
     ]
 
-    CreateGemmOperator(manifest, layouts, tile_descriptions, \
+    operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
       data_type, alignment_constraints)
 
     # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
@@ -2238,11 +2238,12 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
         math_inst.element_accumulator,
       ]
 
-      operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
+      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
         data_type_mixed, alignment_constraints)
 
     for op in operations:
-      if op.tile_description.threadblock_shape[1] <= 32:
+      if (DataTypeSize[op.C.element] == 16) and \
+         (op.tile_description.threadblock_shape[1] <= 32):
        op.C.alignment = 4
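
Note: as a minimal, hypothetical sketch of the rule this patch introduces (stand-in classes and a trimmed DataTypeSize table, not the actual cutlass_library definitions), the loop now lowers C alignment to 4 only for operations whose C element is 16 bits wide and whose threadblock N dimension is at most 32; wider-accumulator variants keep their original alignment.

    # Hypothetical stand-ins for illustration only; not the CUTLASS API.
    from dataclasses import dataclass

    DataTypeSize = {'f16': 16, 'f32': 32}   # trimmed stand-in for the real table

    @dataclass
    class TensorDescription:                # stand-in: C operand type and alignment
        element: str
        alignment: int = 8

    @dataclass
    class TileDescription:                  # stand-in: only the threadblock shape matters here
        threadblock_shape: tuple

    @dataclass
    class Operation:                        # stand-in for a generated GEMM operation
        C: TensorDescription
        tile_description: TileDescription

    ops = [
        Operation(TensorDescription('f16'), TileDescription((128, 32, 64))),  # narrow N, 16-bit C
        Operation(TensorDescription('f32'), TileDescription((128, 32, 64))),  # narrow N, 32-bit C
    ]

    for op in ops:
        # The fix: require a 16-bit C element in addition to a narrow threadblock N,
        # so kernels with a 32-bit C operand keep their wider alignment.
        if DataTypeSize[op.C.element] == 16 and op.tile_description.threadblock_shape[1] <= 32:
            op.C.alignment = 4

    print([op.C.alignment for op in ops])   # -> [4, 8]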