fix alignmentC=8 for imma N=128 (#822)

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2023-02-15 12:06:00 -05:00 · 2023-02-15 12:06:00 -05:00 · 9fb38ac048
commit 9fb38ac048
parent 8f5c242426
1 changed file with 6 additions and 6 deletions
--- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
@@ -270,7 +270,7 @@ struct DefaultIteratorsTensorOp<
  >;
  using WarpTileIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256),
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
                             WarpTileIteratorNotMixed,
                             WarpTileIteratorMixed>::type;
@@ -289,7 +289,7 @@ struct DefaultIteratorsTensorOp<
  >;
  using SharedLoadIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256),
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
                             SharedLoadIteratorNotMixed,
                             SharedLoadIteratorMixed>::type;
@@ -337,7 +337,7 @@ struct DefaultIteratorsTensorOp<
  >;
  using WarpTileIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256),
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
                             WarpTileIteratorNotMixed,
                             WarpTileIteratorMixed>::type;
@@ -356,7 +356,7 @@ struct DefaultIteratorsTensorOp<
  >;
  using SharedLoadIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256),
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
                             SharedLoadIteratorNotMixed,
                             SharedLoadIteratorMixed>::type;
@@ -404,7 +404,7 @@ struct DefaultIteratorsTensorOp<
  >;
  using WarpTileIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256),
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
                             WarpTileIteratorNotMixed,
                             WarpTileIteratorMixed>::type;
@@ -423,7 +423,7 @@ struct DefaultIteratorsTensorOp<
  >;
  using SharedLoadIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256),
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
                             SharedLoadIteratorNotMixed,
                             SharedLoadIteratorMixed>::type;