Re-enable all alignments for int accumulators (#807)

2023-02-06 22:01:15 -05:00 · 2023-02-06 22:01:15 -05:00 · 5921043981
commit 5921043981
parent add4ba622f
1 changed file with 6 additions and 4 deletions
--- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
@@ -136,14 +136,15 @@ struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, In
  static int const kFragmentsPerIteration = 2;
 };

-/// Partial specialization for int32_t <= int32_t x 4
+/// Partial specialization for int32_t <= int32_t
 template <
+  int ElementsPerAccess,
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename ThreadMap
 >
-struct DefaultIteratorsTensorOp<int32_t, int32_t, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+struct DefaultIteratorsTensorOp<int32_t, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
  
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
    WarpShape,
@@ -160,14 +161,15 @@ struct DefaultIteratorsTensorOp<int32_t, int32_t, 4, ThreadblockShape, WarpShape
  static int const kFragmentsPerIteration = 1;
 };

-/// Partial specialization for float <= int32_t x 4
+/// Partial specialization for float <= int32_t
 template <
+  int ElementsPerAccess,
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename ThreadMap
 >
-struct DefaultIteratorsTensorOp<float, int32_t, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+struct DefaultIteratorsTensorOp<float, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {

  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
    WarpShape,