From 8ac2edc810551686f066fb6f7483ea850d4a372e Mon Sep 17 00:00:00 2001 From: Kun Wu Date: Fri, 5 Jan 2024 07:27:45 -0600 Subject: [PATCH] expose stream API in python kernel call interfaces (#1287) * expose stream API in python kernel call interfaces * add stream to ReductionArguments; document stream arg * add stream argument to GemmGroupedArguments --- python/cutlass/backend/arguments.py | 5 ++++ python/cutlass/backend/conv2d_operation.py | 3 +++ python/cutlass/backend/gemm_operation.py | 12 +++++++++ python/cutlass/backend/reduction_operation.py | 5 ++++ python/cutlass/op/conv.py | 27 ++++++++++++------- python/cutlass/op/gemm.py | 7 ++++- python/cutlass/op/gemm_grouped.py | 9 +++++-- 7 files changed, 56 insertions(+), 12 deletions(-) diff --git a/python/cutlass/backend/arguments.py b/python/cutlass/backend/arguments.py index 2c188334..2fd988fc 100644 --- a/python/cutlass/backend/arguments.py +++ b/python/cutlass/backend/arguments.py @@ -62,6 +62,11 @@ class ArgumentBase: # by default, tensor_C is not bias self.bias = False + if "stream" in kwargs.keys(): + self.stream = kwargs["stream"] + else: + self.stream = cuda.CUstream(0) + # RMM buffers used to track tensor lifetime self.buffers = {} # Host tensor to copy the computed result back diff --git a/python/cutlass/backend/conv2d_operation.py b/python/cutlass/backend/conv2d_operation.py index 4a2f2f03..e323b986 100644 --- a/python/cutlass/backend/conv2d_operation.py +++ b/python/cutlass/backend/conv2d_operation.py @@ -97,6 +97,8 @@ class Conv2dArguments(ArgumentBase): :type split_k_mode: cutlass_library.library.SplitKMode, optional :param output_op: output operator, optional :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments` + :param stream: cuda stream, defaults to cuda.cuda.CUstream(0) + :type stream: :class:`cuda.cuda.CUstream` """ def __init__(self, operation, problem_size, A, B, C, D, @@ -448,6 +450,7 @@ class Conv2dOperation: arguments.host_workspace, arguments.device_workspace, arguments.launch_config, + arguments.stream ) if err != cuda.CUresult.CUDA_SUCCESS: diff --git a/python/cutlass/backend/gemm_operation.py b/python/cutlass/backend/gemm_operation.py index 76543d76..85b64f29 100644 --- a/python/cutlass/backend/gemm_operation.py +++ b/python/cutlass/backend/gemm_operation.py @@ -164,6 +164,9 @@ class GemmArguments2x(ArgumentBase): :param output_op: output operator, optional :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments` + + :param stream: cuda stream, defaults to cuda.cuda.CUstream(0) + :type stream: :class:`cuda.cuda.CUstream` """ def __init__(self, operation, problem_size, A, B, C, D, gemm_mode=GemmUniversalMode.Gemm, **kwargs): @@ -666,6 +669,9 @@ class GemmGroupedArguments: :param output_op: output operator, optional :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments` + + :param stream: cuda stream, defaults to cuda.cuda.CUstream(0) + :type stream: :class:`cuda.cuda.CUstream` """ def __init__(self, operation, problem_sizes, A, B, C, D, **kwargs): @@ -765,6 +771,11 @@ class GemmGroupedArguments: self.output_op = kwargs["output_op"] else: self.output_op = self.operation.epilogue_type(1.0, 0.0) + + if "stream" in kwargs.keys(): + self.stream = kwargs["stream"] + else: + self.stream = cuda.CUstream(0) # Get host problem size self.host_problem_size_ptr = np.array(problem_size_host, dtype=np.int32).__array_interface__["data"][0] @@ -1542,6 +1553,7 @@ class GemmOperationBase: arguments.host_workspace, arguments.device_workspace, arguments.launch_config, + arguments.stream ) if err != cuda.CUresult.CUDA_SUCCESS: diff --git a/python/cutlass/backend/reduction_operation.py b/python/cutlass/backend/reduction_operation.py index 5d42cc52..2a13d353 100644 --- a/python/cutlass/backend/reduction_operation.py +++ b/python/cutlass/backend/reduction_operation.py @@ -79,6 +79,10 @@ class ReductionArguments: else: # by default, tensor_C is not bias self.bias = False + if "stream" in kwargs.keys(): + self.stream = kwargs["stream"] + else: + self.stream = cuda.CUstream(0) self.operation = operation self.ptr_workspace = workspace @@ -386,6 +390,7 @@ class ReductionOperation: host_workspace, device_workspace, launch_config, + arguments.stream ) if err != cuda.CUresult.CUDA_SUCCESS: diff --git a/python/cutlass/op/conv.py b/python/cutlass/op/conv.py index 3b8545fa..e2c4389d 100644 --- a/python/cutlass/op/conv.py +++ b/python/cutlass/op/conv.py @@ -131,6 +131,7 @@ from cutlass.backend.library import TensorDescription, TileDescription from cutlass.op.op import OperationBase from cutlass.shape import Conv2DProblemSize, MatrixCoord from cutlass.utils import check, datatypes +from cuda import cuda class Conv2d(OperationBase): @@ -733,7 +734,8 @@ class Conv2d(OperationBase): stride=(1, 1), padding=(0, 0), dilation=(1, 1), alpha=None, beta=None, split_k=("serial", 1), sync: bool = True, - print_module: bool = False) -> Conv2dArguments: + print_module: bool = False, + stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments: """ Runs the kernel currently specified. If it has not already been, the kernel is emitted and compiled. Tensors holding operands and outputs of the kernel are sourced either from the @@ -760,6 +762,8 @@ class Conv2d(OperationBase): :type sync: bool :param print_module: whether to print the emitted C++ code :type print_module: bool + :param stream: cuda stream, defaults to cuda.cuda.CUstream(0) + :type stream: :class:`cuda.cuda.CUstream` :return: arguments passed in to the kernel :rtype: cutlass.backend.Conv2dArguments @@ -850,7 +854,8 @@ class Conv2d(OperationBase): A=A, B=B, C=C, D=D, output_op=self.operation.epilogue_type(*epilogue_args), split_k_mode=datatypes.getattr_enum(SplitKMode, split_k[0]), - split_k_slices=split_k[1] + split_k_slices=split_k[1], + stream=stream ) self.operation.run(arguments) @@ -864,7 +869,8 @@ class Conv2d(OperationBase): workspace=arguments.ptr_D, destination=D, source=C, - output_op=self.reduction_operation.epilogue_type(*epilogue_args) + output_op=self.reduction_operation.epilogue_type(*epilogue_args), + stream=stream ) self.reduction_operation.run(reduction_arguments) @@ -919,11 +925,12 @@ class Conv2dFprop(Conv2d): def run( self, input=None, weight=None, C=None, output=None, alpha=None, beta=None, stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1), - sync: bool = True, print_module: bool = False) -> Conv2dArguments: + sync: bool = True, print_module: bool = False, + stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments: A, B, D = input, weight, output return super().run( - A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module) + A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream) class Conv2dDgrad(Conv2d): @@ -943,11 +950,12 @@ class Conv2dDgrad(Conv2d): def run(self, grad_output=None, weight=None, C=None, grad_input=None, alpha=None, beta=None, stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1), - sync: bool = True, print_module: bool = False) -> Conv2dArguments: + sync: bool = True, print_module: bool = False, + stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments: # A, B, D = grad_output, weight, grad_input return super().run( - A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module) + A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream) class Conv2dWgrad(Conv2d): @@ -967,8 +975,9 @@ class Conv2dWgrad(Conv2d): def run(self, grad_output=None, input=None, C=None, grad_weight=None, alpha=None, beta=None, stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1), - sync: bool = True, print_module: bool = False) -> Conv2dArguments: + sync: bool = True, print_module: bool = False, + stream: cuda.CUstream = cuda.CUstream(0)) -> Conv2dArguments: # A, B, D = grad_output, input, grad_weight return super().run( - A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module) + A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream) diff --git a/python/cutlass/op/gemm.py b/python/cutlass/op/gemm.py index 72afcba5..61e4f6a8 100644 --- a/python/cutlass/op/gemm.py +++ b/python/cutlass/op/gemm.py @@ -131,6 +131,7 @@ from cutlass.backend.library import TensorDescription, TileDescription from cutlass.op.op import OperationBase from cutlass.shape import GemmCoord from cutlass.utils import check, datatypes +from cuda import cuda class Gemm(OperationBase): @@ -621,7 +622,8 @@ class Gemm(OperationBase): f'layout of ({ref_type}, {ref_layout}) and transpose failed.') def run(self, A=None, B=None, C=None, D=None, - alpha=None, beta=None, sync: bool = True, print_module: bool = False, visitor_args: dict = None) -> GemmArguments: + alpha=None, beta=None, sync: bool = True, print_module: bool = False, visitor_args: dict = None, + stream: cuda.CUstream = cuda.CUstream(0)) -> GemmArguments: """ Runs the kernel currently specified. If it has not already been, the kernel is emitted and compiled. Tensors holding operands and outputs of the kernel are sourced either from the @@ -644,6 +646,8 @@ class Gemm(OperationBase): :type sync: bool :param print_module: whether to print the emitted C++ code :type print_module: bool + :param stream: cuda stream, defaults to cuda.cuda.CUstream(0) + :type stream: :class:`cuda.cuda.CUstream` :return: arguments passed in to the kernel :rtype: cutlass.backend.GemmArguments @@ -687,6 +691,7 @@ class Gemm(OperationBase): 'D': self._get_batch_stride(D) } } + kwargs['stream'] = stream if isinstance(self.epilogue_functor, EpilogueFunctorVisitor): output_op = self.operation.epilogue_type(visitor_args) diff --git a/python/cutlass/op/gemm_grouped.py b/python/cutlass/op/gemm_grouped.py index f88dea17..162e0493 100644 --- a/python/cutlass/op/gemm_grouped.py +++ b/python/cutlass/op/gemm_grouped.py @@ -65,6 +65,7 @@ from cutlass.backend.library import ( from cutlass.op.gemm import Gemm from cutlass.shape import GemmCoord from cutlass.utils import check, datatypes +from cuda import cuda class GroupedGemm(Gemm): @@ -194,7 +195,8 @@ class GroupedGemm(Gemm): def run(self, A, B, C, D, alpha=None, beta=None, sync: bool = True, - print_module: bool = False) -> GemmGroupedArguments: + print_module: bool = False, + stream: cuda.CUstream = cuda.CUstream(0)) -> GemmGroupedArguments: """ Runs the kernel currently specified. @@ -217,6 +219,8 @@ class GroupedGemm(Gemm): :type sync: bool :param print_module: whether to print the emitted C++ code :type print_module: bool + :param stream: cuda stream, defaults to cuda.cuda.CUstream(0) + :type stream: :class:`cuda.cuda.CUstream` :return: arguments passed in to the kernel :rtype: cutlass.backend.GemmGroupedArguments @@ -248,7 +252,8 @@ class GroupedGemm(Gemm): operation=self.operation, problem_sizes=problem_sizes, A=As, B=Bs, C=Cs, D=Ds, - output_op=self.operation.epilogue_type(alpha, beta) + output_op=self.operation.epilogue_type(alpha, beta), + stream=stream ) self.operation.run(arguments)