update float < int32_t * 4 (#488)

Co-authored-by: 赵俊涛 <zhaojuntao@zhaojuntaos-MacBook-Pro.local>
2022-05-05 01:36:05 +08:00 · 2022-05-05 01:36:05 +08:00 · ddd8f9cf41
commit ddd8f9cf41
parent ec2b4fd85d
1 changed files with 24 additions and 0 deletions
--- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
@ -158,6 +158,30 @@ struct DefaultIteratorsTensorOp<int32_t, int32_t, 4, ThreadblockShape, WarpShape
  static int const kFragmentsPerIteration = 1;
 };
 /// Partial specialization for float <= int32_t x 4.
 ///
 /// Matches DefaultIteratorsTensorOp when its first three template arguments are
 /// exactly `float`, `int32_t` and `4`; the shape and thread-map arguments stay generic.
 /// NOTE(review): going by the "float <= int32_t x 4" wording, these presumably denote the
 /// output element type, the accumulator element type and the elements per access --
 /// confirm against the primary template.
 template <class ThreadblockShape, class WarpShape, class InstructionShape, class ThreadMap>
 struct DefaultIteratorsTensorOp<
     float,
     int32_t,
     4,
     ThreadblockShape,
     WarpShape,
     InstructionShape,
     ThreadMap> {
  /// Fragments per iteration; fixed at one for this specialization.
  static const int kFragmentsPerIteration = 1;

  /// Warp-level tile iterator, instantiated with int32_t elements and a row-major layout.
  using WarpTileIterator = cutlass::epilogue::warp::
      TileIteratorTensorOp<WarpShape, InstructionShape, int32_t, layout::RowMajor>;

  /// Threadblock-level shared load iterator, likewise instantiated with int32_t elements
  /// (the same element type the warp tile iterator above uses).
  using SharedLoadIterator =
      cutlass::epilogue::threadblock::SharedLoadIterator<ThreadMap, int32_t>;
 };
 /// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
 template <
  typename ThreadblockShape,