diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py
index 365ea6cf..099c4271 100644
--- a/python/cutlass_library/generator.py
+++ b/python/cutlass_library/generator.py
@@ -2225,7 +2225,7 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
       math_inst.element_accumulator,
     ]
 
-    CreateGemmOperator(manifest, layouts, tile_descriptions, \
+    operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
       data_type, alignment_constraints)
 
     # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
@@ -2238,11 +2238,12 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
         math_inst.element_accumulator,
       ]
 
-      operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
+      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
         data_type_mixed, alignment_constraints) 
     
       for op in operations:
-        if op.tile_description.threadblock_shape[1] <= 32:
+        if (DataTypeSize[op.C.element] == 16) and \
+           (op.tile_description.threadblock_shape[1] <= 32):
           op.C.alignment = 4