Changes to iterators to support s8 gemm with f16 outputs (#812)

* Changes to iterators to support s8 gemm with f16 outputs

* should work

---------

Co-authored-by: Sujan Gonugondla <gsujan@amaon.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
commit d8359c804b (parent 34bed24af3)
Author: Sujan Kumar Gonugondla
Date:   2023-02-16 18:37:51 -05:00 (committed by GitHub)
3 changed files with 47 additions and 7 deletions
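
For context, the combination this commit targets can be expressed with the CUTLASS 2.x device-level GEMM API. The configuration below is an illustrative sketch rather than code from this commit: the layouts, tile shapes, architecture tag, and epilogue compute type are assumptions chosen to exercise the int8-input / int32-accumulator / half_t-output path with 8 output elements per access.

#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/thread/linear_combination.h"

// Illustrative s8 x s8 -> f16 GEMM: int8 operands, int32 accumulation,
// half-precision output written 8 elements per access (assumed shapes).
using GemmS8F16 = cutlass::gemm::device::Gemm<
    int8_t, cutlass::layout::RowMajor,             // A: s8
    int8_t, cutlass::layout::ColumnMajor,          // B: s8
    cutlass::half_t, cutlass::layout::RowMajor,    // C/D: f16 output
    int32_t,                                       // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 64>,        // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 64>,          // warp tile
    cutlass::gemm::GemmShape<16, 8, 32>,           // tensor core instruction
    cutlass::epilogue::thread::LinearCombination<
        cutlass::half_t, 8, int32_t, float>        // 8 f16 elements per access
>;

The epilogue's 8-wide half_t accesses are what the new DefaultIteratorsTensorOp specialization below is keyed on.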

File 1 of 3:

@@ -224,6 +224,44 @@ struct DefaultIteratorsTensorOp<
   static int const kFragmentsPerIteration = 2;
 };
 
+/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t,
+  int32_t,
+  8,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
 /// Partial specialization for int8/int4b_t <= int32 x 16/8 epilogues avoids shared memory bank conflicts.
 /// Threadblock::kN = 256 still has bank conflicts.
 template <
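
The hunk above adds the DefaultIteratorsTensorOp specialization that routes half_t outputs with int32_t accumulators and 8-wide accesses to the bank-conflict-avoiding "Mixed" warp and shared-load iterators. A minimal sketch of an instantiation that would pick it up, assuming the struct lives in the usual cutlass::epilogue::threadblock::detail namespace and using illustrative tile shapes and thread map:

#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
#include "cutlass/gemm/gemm.h"

// Assumed example shapes; any shapes consistent with the kernel would do.
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;

using ThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
    ThreadblockShape, WarpShape, /*PartitionsK=*/1,
    cutlass::half_t, /*ElementsPerAccess=*/8>::Type;

// Resolves to the new partial specialization: WarpTileIterator and SharedLoadIterator
// are the Mixed variants (int32_t accumulators in shared memory, 16-bit outputs,
// 8 elements per access, 8 contiguous lanes), with kFragmentsPerIteration == 2.
using Iterators = cutlass::epilogue::threadblock::detail::DefaultIteratorsTensorOp<
    cutlass::half_t, int32_t, /*ElementsPerAccess=*/8,
    ThreadblockShape, WarpShape, InstructionShape, ThreadMap>;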

File 2 of 3:

@@ -70,8 +70,9 @@ template <
   int ElementSizeBits_,    ///< Size of accumulator in bits
   int OutputSizeBits_,     ///< Size of output element in bits
   int ElementsPerAccess,   ///< Vector length of output vector
-  int ContiguousLanes      ///< Number of lanes in the warp writing to contiguous elements
+  int ContiguousLanes,     ///< Number of lanes in the warp writing to contiguous elements
                            ///  in the global memory tensor
+  bool EightBitsOutputOrLess = (OutputSizeBits_ <= 8)
 >
 class SharedLoadIteratorMixed;
@@ -85,7 +86,7 @@ template <
   typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
   typename Element_          ///< Accumulator data type
 >
-class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, 16, 8, 8> {
+class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, 16, 8, 8, false> {
 public:
   using ThreadMap = ThreadMap_;
   using Shape = typename ThreadMap::Shape;
@@ -253,7 +254,7 @@ template <
   typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
   int OutputSizeBits_        ///< Size of output element in bits
 >
-class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, OutputSizeBits_, 16, 8> {
+class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, OutputSizeBits_, 16, 8, true> {
 public:
   using ThreadMap = ThreadMap_;
   using Shape = typename ThreadMap::Shape;
@@ -418,7 +419,7 @@ template <
   typename ThreadMap_,      ///< Thread map (concept: OutputTileThreadMap)
   int OutputSizeBits_
 >
-class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, OutputSizeBits_, 8, 8> {
+class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, OutputSizeBits_, 8, 8, true> {
 public:
   using ThreadMap = ThreadMap_;
   using Shape = typename ThreadMap::Shape;
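
The defaulted EightBitsOutputOrLess parameter added above keeps partial-specialization matching unambiguous now that int32_t accumulators can pair with 16-bit outputs: without it, the instantiation sketched below would match both the generic <ThreadMap_, Element_, 32, 16, 8, 8> specialization and the int8-oriented <ThreadMap_, int32_t, 32, OutputSizeBits_, 8, 8> one. Shapes and the thread map here are illustrative assumptions, not taken from the commit.

#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
#include "cutlass/gemm/gemm.h"

// Assumed example shapes and thread map.
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 64>;
using ThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
    ThreadblockShape, WarpShape, /*PartitionsK=*/1,
    cutlass::half_t, /*ElementsPerAccess=*/8>::Type;

// half_t (16-bit) output from int32_t accumulators: EightBitsOutputOrLess defaults to
// (16 <= 8) == false, so this resolves to the <ThreadMap_, Element_, 32, 16, 8, 8, false>
// specialization instead of colliding with <ThreadMap_, int32_t, 32, OutputSizeBits_, 8, 8, true>.
// Outputs of 8 bits or narrower evaluate the default to true and keep their existing paths.
using HalfOutLoader = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
    ThreadMap, int32_t, /*ElementSizeBits=*/32, /*OutputSizeBits=*/16,
    /*ElementsPerAccess=*/8, /*ContiguousLanes=*/8>;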

File 3 of 3:

@@ -64,7 +64,8 @@ template <
   int ElementSizeBits,       ///< Size of accumulator element in bits
   int OutputSizeBits,        ///< Size of output element in bits
   int OutputElementCount,    ///< number of elements in output vector
-  int ContiguousLanes        ///< Number of consecutive lanes writing to contiguous memory
+  int ContiguousLanes,       ///< Number of consecutive lanes writing to contiguous memory
+  bool EightBitsOutputOrLess = (OutputSizeBits <= 8)
 >
 class TileIteratorTensorOpMixed {
 public:
@@ -319,7 +320,7 @@ template <
   typename OperatorShape_,    ///< matrix multiply operation shape (concept: gemm::GemmShape),
   int OutputSizeBits          ///< Size of output element in bits
 >
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 16, 8> {
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 16, 8, true> {
 public:
   using WarpShape = WarpShape_;
@@ -526,7 +527,7 @@ template <
   typename OperatorShape_,   ///< matrix multiply operation shape (concept: gemm::GemmShape)
   int OutputSizeBits         ///< Size of output element in bits
 >
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 8, 8> {
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 8, 8, true> {
 public:
   using WarpShape = WarpShape_;
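
The warp-level tile iterator gets the same dispatch fix. With the new defaulted parameter, the 16-bit-output instantiation used by the half <= int32_t epilogue no longer matches the int8-oriented partial specializations (now restricted to EightBitsOutputOrLess == true) and, per the specializations touched in this diff, falls through to the primary template. A sketch with assumed shapes:

#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
#include "cutlass/gemm/gemm.h"

// Assumed example shapes.
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;

// int32_t accumulators, 16-bit outputs, 8 output elements, 8 contiguous lanes:
// EightBitsOutputOrLess defaults to (16 <= 8) == false, so neither
// <..., int32_t, 32, OutputSizeBits, 16, 8, true> nor <..., int32_t, 32, OutputSizeBits, 8, 8, true>
// applies, and the primary TileIteratorTensorOpMixed template handles the half_t-output case.
using HalfOutWarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
    WarpShape, InstructionShape, int32_t,
    /*ElementSizeBits=*/32, /*OutputSizeBits=*/16,
    /*OutputElementCount=*/8, /*ContiguousLanes=*/8>;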