From 8ac2edc810551686f066fb6f7483ea850d4a372e Mon Sep 17 00:00:00 2001
From: Kun Wu <tonywukun@qq.com>
Date: Fri, 5 Jan 2024 07:27:45 -0600
Subject: [PATCH] expose stream API in python kernel call interfaces (#1287)

* expose stream API in python kernel call interfaces

* add stream to ReductionArguments; document stream arg

* add stream argument to GemmGroupedArguments
---
 python/cutlass/backend/arguments.py           |  5 ++++
 python/cutlass/backend/conv2d_operation.py    |  3 +++
 python/cutlass/backend/gemm_operation.py      | 12 +++++++++
 python/cutlass/backend/reduction_operation.py |  5 ++++
 python/cutlass/op/conv.py                     | 27 ++++++++++++-------
 python/cutlass/op/gemm.py                     |  7 ++++-
 python/cutlass/op/gemm_grouped.py             |  9 +++++--
 7 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/python/cutlass/backend/arguments.py b/python/cutlass/backend/arguments.py
index 2c188334..2fd988fc 100644
--- a/python/cutlass/backend/arguments.py
+++ b/python/cutlass/backend/arguments.py
@@ -62,6 +62,11 @@ class ArgumentBase:
             # by default, tensor_C is not bias
             self.bias = False
 
+        if "stream" in kwargs.keys():
+            self.stream = kwargs["stream"]
+        else:
+            self.stream = cuda.CUstream(0)
+
         # RMM buffers used to track tensor lifetime
         self.buffers = {}
         # Host tensor to copy the computed result back
diff --git a/python/cutlass/backend/conv2d_operation.py b/python/cutlass/backend/conv2d_operation.py
index 4a2f2f03..e323b986 100644
--- a/python/cutlass/backend/conv2d_operation.py
+++ b/python/cutlass/backend/conv2d_operation.py
@@ -97,6 +97,8 @@ class Conv2dArguments(ArgumentBase):
     :type split_k_mode: cutlass_library.library.SplitKMode, optional
     :param output_op: output operator, optional
     :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments`
+    :param stream: CUDA stream to run the kernel on, defaults to cuda.cuda.CUstream(0)
+    :type stream: :class:`cuda.cuda.CUstream`
     """
 
     def __init__(self, operation, problem_size, A, B, C, D,
@@ -448,6 +450,7 @@ class Conv2dOperation:
             arguments.host_workspace,
             arguments.device_workspace,
             arguments.launch_config,
+            arguments.stream
         )
 
         if err != cuda.CUresult.CUDA_SUCCESS:
diff --git a/python/cutlass/backend/gemm_operation.py b/python/cutlass/backend/gemm_operation.py
index 76543d76..85b64f29 100644
--- a/python/cutlass/backend/gemm_operation.py
+++ b/python/cutlass/backend/gemm_operation.py
@@ -164,6 +164,9 @@ class GemmArguments2x(ArgumentBase):
 
     :param output_op: output operator, optional
     :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments`
+
+    :param stream: CUDA stream to run the kernel on, defaults to cuda.cuda.CUstream(0)
+    :type stream: :class:`cuda.cuda.CUstream`
     """
 
     def __init__(self, operation, problem_size, A, B, C, D, gemm_mode=GemmUniversalMode.Gemm, **kwargs):
@@ -666,6 +669,9 @@ class GemmGroupedArguments:
 
     :param output_op: output operator, optional
     :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments`
+
+    :param stream: CUDA stream to run the kernel on, defaults to cuda.cuda.CUstream(0)
+    :type stream: :class:`cuda.cuda.CUstream`
     """
 
     def __init__(self, operation, problem_sizes, A, B, C, D, **kwargs):
@@ -765,6 +771,11 @@ class GemmGroupedArguments:
             self.output_op = kwargs["output_op"]
         else:
             self.output_op = self.operation.epilogue_type(1.0, 0.0)
+
+        if "stream" in kwargs.keys():
+            self.stream = kwargs["stream"]
+        else:
+            self.stream = cuda.CUstream(0)
 
         # Get host problem size
         self.host_problem_size_ptr = np.array(problem_size_host, dtype=np.int32).__array_interface__["data"][0]
@@ -1542,6 +1553,7 @@ class GemmOperationBase:
             arguments.host_workspace,
             arguments.device_workspace,
             arguments.launch_config,
+            arguments.stream
         )
 
         if err != cuda.CUresult.CUDA_SUCCESS:
diff --git a/python/cutlass/backend/reduction_operation.py b/python/cutlass/backend/reduction_operation.py
index 5d42cc52..2a13d353 100644
--- a/python/cutlass/backend/reduction_operation.py
+++ b/python/cutlass/backend/reduction_operation.py
@@ -79,6 +79,10 @@ class ReductionArguments:
         else:
             # by default, tensor_C is not bias
             self.bias = False
+        if "stream" in kwargs.keys():
+            self.stream = kwargs["stream"]
+        else:
+            self.stream = cuda.CUstream(0)
 
         self.operation = operation
         self.ptr_workspace = workspace
@@ -386,6 +390,7 @@ class ReductionOperation:
             host_workspace,
             device_workspace,
             launch_config,
+            arguments.stream
         )
 
         if err != cuda.CUresult.CUDA_SUCCESS:
diff --git a/python/cutlass/op/conv.py b/python/cutlass/op/conv.py
index 3b8545fa..e2c4389d 100644
--- a/python/cutlass/op/conv.py
+++ b/python/cutlass/op/conv.py
@@ -131,6 +131,7 @@ from cutlass.backend.library import TensorDescription, TileDescription
 from cutlass.op.op import OperationBase
 from cutlass.shape import Conv2DProblemSize, MatrixCoord
 from cutlass.utils import check, datatypes
+from cuda import cuda
 
 
 class Conv2d(OperationBase):
@@ -733,7 +734,8 @@ class Conv2d(OperationBase):
             stride=(1, 1), padding=(0, 0), dilation=(1, 1),
             alpha=None, beta=None,
             split_k=("serial", 1), sync: bool = True,
-            print_module: bool = False) -> Conv2dArguments:
+            print_module: bool = False,
+            stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments:
         """
         Runs the kernel currently specified. If it has not already been, the kernel is emitted and
         compiled. Tensors holding operands and outputs of the kernel are sourced either from the
@@ -760,6 +762,8 @@ class Conv2d(OperationBase):
         :type sync: bool
         :param print_module: whether to print the emitted C++ code
         :type print_module: bool
+        :param stream: CUDA stream to run the kernel on, defaults to cuda.cuda.CUstream(0)
+        :type stream: :class:`cuda.cuda.CUstream`
 
         :return: arguments passed in to the kernel
         :rtype: cutlass.backend.Conv2dArguments
@@ -850,7 +854,8 @@ class Conv2d(OperationBase):
             A=A, B=B, C=C, D=D,
             output_op=self.operation.epilogue_type(*epilogue_args),
             split_k_mode=datatypes.getattr_enum(SplitKMode, split_k[0]),
-            split_k_slices=split_k[1]
+            split_k_slices=split_k[1],
+            stream=stream
         )
 
         self.operation.run(arguments)
@@ -864,7 +869,8 @@ class Conv2d(OperationBase):
                 workspace=arguments.ptr_D,
                 destination=D,
                 source=C,
-                output_op=self.reduction_operation.epilogue_type(*epilogue_args)
+                output_op=self.reduction_operation.epilogue_type(*epilogue_args),
+                stream=stream
             )
             self.reduction_operation.run(reduction_arguments)
 
@@ -919,11 +925,12 @@ class Conv2dFprop(Conv2d):
     def run(
         self, input=None, weight=None, C=None, output=None, alpha=None, beta=None,
         stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1),
-        sync: bool = True, print_module: bool = False) -> Conv2dArguments:
+        sync: bool = True, print_module: bool = False,
+        stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments:
 
         A, B, D = input, weight, output
         return super().run(
-            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module)
+            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream)
 
 
 class Conv2dDgrad(Conv2d):
@@ -943,11 +950,12 @@ class Conv2dDgrad(Conv2d):
 
     def run(self, grad_output=None, weight=None, C=None, grad_input=None, alpha=None, beta=None,
         stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1),
-        sync: bool = True, print_module: bool = False) -> Conv2dArguments:
+        sync: bool = True, print_module: bool = False,
+        stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments:
         #
         A, B, D = grad_output, weight, grad_input
         return super().run(
-            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module)
+            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream)
 
 
 class Conv2dWgrad(Conv2d):
@@ -967,8 +975,9 @@ class Conv2dWgrad(Conv2d):
 
     def run(self, grad_output=None, input=None, C=None, grad_weight=None, alpha=None, beta=None,
         stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1),
-        sync: bool = True, print_module: bool = False) -> Conv2dArguments:
+        sync: bool = True, print_module: bool = False,
+        stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments:
         #
         A, B, D = grad_output, input, grad_weight
         return super().run(
-            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module)
+            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream)
diff --git a/python/cutlass/op/gemm.py b/python/cutlass/op/gemm.py
index 72afcba5..61e4f6a8 100644
--- a/python/cutlass/op/gemm.py
+++ b/python/cutlass/op/gemm.py
@@ -131,6 +131,7 @@ from cutlass.backend.library import TensorDescription, TileDescription
 from cutlass.op.op import OperationBase
 from cutlass.shape import GemmCoord
 from cutlass.utils import check, datatypes
+from cuda import cuda
 
 
 class Gemm(OperationBase):
@@ -621,7 +622,8 @@ class Gemm(OperationBase):
                                 f'layout of ({ref_type}, {ref_layout}) and transpose failed.')
 
     def run(self, A=None, B=None, C=None, D=None,
-            alpha=None, beta=None, sync: bool = True, print_module: bool = False, visitor_args: dict = None) -> GemmArguments:
+            alpha=None, beta=None, sync: bool = True, print_module: bool = False, visitor_args: dict = None,
+            stream: cuda.CUstream = cuda.CUstream(0)) -> GemmArguments:
         """
         Runs the kernel currently specified. If it has not already been, the kernel is emitted and
         compiled. Tensors holding operands and outputs of the kernel are sourced either from the
@@ -644,6 +646,8 @@ class Gemm(OperationBase):
         :type sync: bool
         :param print_module: whether to print the emitted C++ code
         :type print_module: bool
+        :param stream: CUDA stream to run the kernel on, defaults to cuda.cuda.CUstream(0)
+        :type stream: :class:`cuda.cuda.CUstream`
 
         :return: arguments passed in to the kernel
         :rtype: cutlass.backend.GemmArguments
@@ -687,6 +691,7 @@ class Gemm(OperationBase):
                     'D': self._get_batch_stride(D)
                 }
             }
+        kwargs['stream'] = stream
 
         if isinstance(self.epilogue_functor, EpilogueFunctorVisitor):
             output_op = self.operation.epilogue_type(visitor_args)
diff --git a/python/cutlass/op/gemm_grouped.py b/python/cutlass/op/gemm_grouped.py
index f88dea17..162e0493 100644
--- a/python/cutlass/op/gemm_grouped.py
+++ b/python/cutlass/op/gemm_grouped.py
@@ -65,6 +65,7 @@ from cutlass.backend.library import (
 from cutlass.op.gemm import Gemm
 from cutlass.shape import GemmCoord
 from cutlass.utils import check, datatypes
+from cuda import cuda
 
 
 class GroupedGemm(Gemm):
@@ -194,7 +195,8 @@ class GroupedGemm(Gemm):
 
     def run(self, A, B, C, D,
             alpha=None, beta=None, sync: bool = True,
-            print_module: bool = False) -> GemmGroupedArguments:
+            print_module: bool = False,
+            stream: cuda.CUstream = cuda.CUstream(0)) -> GemmGroupedArguments:
         """
         Runs the kernel currently specified.
 
@@ -217,6 +219,8 @@ class GroupedGemm(Gemm):
         :type sync: bool
         :param print_module: whether to print the emitted C++ code
         :type print_module: bool
+        :param stream: CUDA stream to run the kernel on, defaults to cuda.cuda.CUstream(0)
+        :type stream: :class:`cuda.cuda.CUstream`
 
         :return: arguments passed in to the kernel
         :rtype: cutlass.backend.GemmGroupedArguments
@@ -248,7 +252,8 @@ class GroupedGemm(Gemm):
             operation=self.operation,
             problem_sizes=problem_sizes,
             A=As, B=Bs, C=Cs, D=Ds,
-            output_op=self.operation.epilogue_type(alpha, beta)
+            output_op=self.operation.epilogue_type(alpha, beta),
+            stream=stream
         )
 
         self.operation.run(arguments)