cutlass/examples/40_cutlass_py/gemm_grouped.py

################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import pycutlass
from pycutlass import *
import csv

import argparse

# parse the arguments
parser = argparse.ArgumentParser(
    description="Launch CUTLASS GEMM Grouped kernels from python")

# Operation description
# math instruction description
parser.add_argument("-i", "--instruction_shape",
                    default=[1, 1, 1], nargs=3, type=int, 
                    help="This option describes the size of MMA op")
parser.add_argument("-ta", "--element_a", default="float32", type=str,
                    choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], 
                    help='Data type of elements in input tensor A')
parser.add_argument("-tb", "--element_b", default="float32", type=str,
                    choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], 
                    help='Data type of elements in input tensor B')
parser.add_argument("-tc", "--element_c", default="float32", type=str,
                    choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], 
                    help='Data type of elements in input tensor C and output tensor D')
parser.add_argument("-tacc", "--element_acc", default="float32", type=str,
                    choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], 
                    help='Data type of accumulator')
parser.add_argument('-m', "--math", default="multiply_add",
                    type=str, choices=["multiply_add", "multiply_add_fast_bf16", "multiply_add_fast_f32"], help="math instruction")
parser.add_argument('-op', "--opcode", default="simt", type=str,
                    choices=["Simt", 'TensorOp'], help='This option describes whether you want to use tensor \
                        cores (TensorOp) or regular SIMT cores (Simt) on GPU SM')
# tile description
parser.add_argument("-b", "--threadblock_shape",
                    default=[128, 128, 8], nargs=3, type=int, 
                    help="This option describes the tile size a thread block with compute")
parser.add_argument("-s", "--stages", default=4,
                    type=int, help="Number of pipelines you want to use")
parser.add_argument("-w", "--warp_count", default=[
                    4, 2, 1], nargs=3, type=int, 
                    help="This option describes the number of warps along M, N, and K of the threadblock")
parser.add_argument("-cc", "--compute_capability", default=80,
                    type=int, help="This option describes CUDA SM architecture number")
# A
parser.add_argument('-la', "--layout_a", default="RowMajor", type=str, choices=[
                    "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], 
                    help="Memory layout of input tensor A")
parser.add_argument('-aa', '--alignment_a', default=1,
                    type=int, help="Memory alignment of input tensor A")
# B
parser.add_argument('-lb', "--layout_b", default="RowMajor", type=str, choices=[
                    "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], 
                    help="Memory layout of input tensor B")
parser.add_argument('-ab', '--alignment_b', default=1,
                    type=int, help="Memory alignment of input tensor B")
# C
parser.add_argument('-lc', "--layout_c", default="RowMajor", type=str, choices=[
                    "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], 
                    help="Memory layout of input tensor C and output tensor D")
parser.add_argument('-ac', '--alignment_c', default=1,
                    type=int, help="Memory alignment of input tensor C and output tensor D")
# epilogue
parser.add_argument("-te", "--element_epilogue", default="float32", type=str,
                    choices=['float64', 'float32', 'float16', 'bfloat16'], help='Epilogue datatype')
parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination",
                    type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'], 
                    help="This option describes the epilogue part of the kernel")
# swizzling
parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[
                    "IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", "HorizontalSwizzle"],
                    help="This option describes how thread blocks are scheduled on GPU")
# precompute mode
parser.add_argument("-pm", "--precompute_mode",
                    default="Device", type=str, choices=["Host", "Device"],
                    help="Grouped Gemm Scheduing on device only (Device) or using host precompute (Host)")
# arguments
parser.add_argument("-p", "--problem_size_dir", type=str, 
                    help="path to the csv file contains the problem sizes")
parser.add_argument("-alpha", "--alpha", default=1.0, type=float, help="alpha")
parser.add_argument("-beta", "--beta", default=0.0, type=float, help="beta")

parser.add_argument('--print_cuda', action="store_true",
                    help="print the underlying CUDA kernel")

try:
    args = parser.parse_args()
except:
    sys.exit(0)

pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)

element_a = getattr(cutlass, args.element_a)
element_b = getattr(cutlass, args.element_b)
element_c = getattr(cutlass, args.element_c)
element_acc = getattr(cutlass, args.element_acc)
math_operation = getattr(MathOperation, args.math)
opclass = getattr(cutlass.OpClass, args.opcode)

math_inst = MathInstruction(
    args.instruction_shape, element_a, element_b,
    element_acc, opclass, math_operation
)

tile_description = TileDescription(
    args.threadblock_shape, args.stages, args.warp_count,
    math_inst, args.compute_capability, args.compute_capability
)

layout_a = getattr(cutlass, args.layout_a)
layout_b = getattr(cutlass, args.layout_b)
layout_c = getattr(cutlass, args.layout_c)

A = TensorDescription(
    element_a, layout_a, args.alignment_a
)

B = TensorDescription(
    element_b, layout_b, args.alignment_b
)

C = TensorDescription(
    element_c, layout_c, args.alignment_c
)

element_epilogue = getattr(cutlass, args.element_epilogue)
epilogue_functor = getattr(EpilogueFunctor, args.epilogue_functor)
swizzling_functor = getattr(cutlass, args.swizzling_functor)
precompute_mode = getattr(SchedulerMode, args.precompute_mode)

operation = GemmOperationGrouped(
    arch=args.compute_capability, tile_description=tile_description,
    A=A, B=B, C=C, element_epilogue=element_epilogue,
    epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,
    precompute_mode=precompute_mode
)

if args.print_cuda:
    print(operation.rt_module.emit())

pycutlass.compiler.add_module([operation, ])

reference_module = ReferenceModule(A, B, C)

# get problems
problem_sizes = []
with open(args.problem_size_dir) as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        problem_sizes.append(
            cutlass.gemm.GemmCoord(int(row[0]), int(row[1]), int(row[2]))
        )

problem_count = len(problem_sizes)

tensor_As = []
tensor_Bs = []
tensor_Cs = []
tensor_Ds = []
problem_sizes_coord = []
tensor_D_refs = []

for problem_size in problem_sizes:
    if args.element_a != "int8":
        if args.element_a == "bfloat16":
            tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()
                                                                           * problem_size.k(),))).astype(bfloat16)
        else:
            tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()
                                                                           * problem_size.k(),))).astype(getattr(np, args.element_a))
    else:
        tensor_A = np.random.uniform(low=-2, high=2, size=(problem_size.m()
                                                           * problem_size.k(),)).astype(getattr(np, args.element_a))

    if args.element_b != "int8":
        if args.element_b == "bfloat16":
            tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k()
                                                                           * problem_size.n(),))).astype(bfloat16)
        else:
            tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k()
                                                                           * problem_size.n(),))).astype(getattr(np, args.element_b))
    else:
        tensor_B = np.random.uniform(low=-2, high=2, size=(problem_size.k()
                                                           * problem_size.n(),)).astype(getattr(np, args.element_b))

    if args.element_c != "int8":
        if args.element_c == "bfloat16":
            tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()
                                                                           * problem_size.n(),))).astype(bfloat16)
        else:
            tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()
                                                                           * problem_size.n(),))).astype(getattr(np, args.element_c))
    else:
        tensor_C = np.random.uniform(low=-2, high=2, size=(problem_size.m()
                                                           * problem_size.n(),)).astype(getattr(np, args.element_c))
    tensor_D = np.zeros_like(tensor_C)

    tensor_As.append(tensor_A)
    tensor_Bs.append(tensor_B)
    tensor_Cs.append(tensor_C)
    tensor_Ds.append(tensor_D)
    tensor_D_refs.append(reference_module.run(
        tensor_A, tensor_B, tensor_C, problem_size, args.alpha, args.beta))
    problem_sizes_coord.append(problem_size)

arguments = GemmGroupedArguments(
    operation, problem_sizes_coord, tensor_As, tensor_Bs, tensor_Cs, tensor_Ds,
    output_op=LinearCombinationFunctorArguments(args.alpha, args.beta)
)

operation.run(arguments)

arguments.sync()

for tensor_d, tensor_d_ref in zip(tensor_Ds, tensor_D_refs):
    assert np.array_equal(tensor_d, tensor_d_ref)

print("Passed.")
CUTLASS 2.10 (#615) Co-authored-by: Aniket Shivam <ashivam@nvidia.com> 2022-09-04 06:48:46 +08:00			`################################################################################`
			`#`
			`# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
			`# SPDX-License-Identifier: BSD-3-Clause`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# 1. Redistributions of source code must retain the above copyright notice, this`
			`# list of conditions and the following disclaimer.`
			`#`
			`# 2. Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions and the following disclaimer in the documentation`
			`# and/or other materials provided with the distribution.`
			`#`
			`# 3. Neither the name of the copyright holder nor the names of its`
			`# contributors may be used to endorse or promote products derived from`
			`# this software without specific prior written permission.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE`
			`# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE`
			`# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL`
			`# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR`
			`# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER`
			`# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,`
			`# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`#`
			`################################################################################`
			`import pycutlass`
			`from pycutlass import *`
			`import csv`

			`import argparse`

			`# parse the arguments`
			`parser = argparse.ArgumentParser(`
			`description="Launch CUTLASS GEMM Grouped kernels from python")`

			`# Operation description`
			`# math instruction description`
			`parser.add_argument("-i", "--instruction_shape",`
			`default=[1, 1, 1], nargs=3, type=int,`
			`help="This option describes the size of MMA op")`
			`parser.add_argument("-ta", "--element_a", default="float32", type=str,`
			`choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],`
			`help='Data type of elements in input tensor A')`
			`parser.add_argument("-tb", "--element_b", default="float32", type=str,`
			`choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],`
			`help='Data type of elements in input tensor B')`
			`parser.add_argument("-tc", "--element_c", default="float32", type=str,`
			`choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],`
			`help='Data type of elements in input tensor C and output tensor D')`
			`parser.add_argument("-tacc", "--element_acc", default="float32", type=str,`
			`choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],`
			`help='Data type of accumulator')`
			`parser.add_argument('-m', "--math", default="multiply_add",`
			`type=str, choices=["multiply_add", "multiply_add_fast_bf16", "multiply_add_fast_f32"], help="math instruction")`
			`parser.add_argument('-op', "--opcode", default="simt", type=str,`
			`choices=["Simt", 'TensorOp'], help='This option describes whether you want to use tensor \`
			`cores (TensorOp) or regular SIMT cores (Simt) on GPU SM')`
			`# tile description`
			`parser.add_argument("-b", "--threadblock_shape",`
			`default=[128, 128, 8], nargs=3, type=int,`
			`help="This option describes the tile size a thread block with compute")`
			`parser.add_argument("-s", "--stages", default=4,`
			`type=int, help="Number of pipelines you want to use")`
			`parser.add_argument("-w", "--warp_count", default=[`
			`4, 2, 1], nargs=3, type=int,`
			`help="This option describes the number of warps along M, N, and K of the threadblock")`
			`parser.add_argument("-cc", "--compute_capability", default=80,`
			`type=int, help="This option describes CUDA SM architecture number")`
			`# A`
			`parser.add_argument('-la', "--layout_a", default="RowMajor", type=str, choices=[`
			`"RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"],`
			`help="Memory layout of input tensor A")`
			`parser.add_argument('-aa', '--alignment_a', default=1,`
			`type=int, help="Memory alignment of input tensor A")`
			`# B`
			`parser.add_argument('-lb', "--layout_b", default="RowMajor", type=str, choices=[`
			`"RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"],`
			`help="Memory layout of input tensor B")`
			`parser.add_argument('-ab', '--alignment_b', default=1,`
			`type=int, help="Memory alignment of input tensor B")`
			`# C`
			`parser.add_argument('-lc', "--layout_c", default="RowMajor", type=str, choices=[`
			`"RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"],`
			`help="Memory layout of input tensor C and output tensor D")`
			`parser.add_argument('-ac', '--alignment_c', default=1,`
			`type=int, help="Memory alignment of input tensor C and output tensor D")`
			`# epilogue`
			`parser.add_argument("-te", "--element_epilogue", default="float32", type=str,`
			`choices=['float64', 'float32', 'float16', 'bfloat16'], help='Epilogue datatype')`
			`parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination",`
			`type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'],`
			`help="This option describes the epilogue part of the kernel")`
			`# swizzling`
			`parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[`
			`"IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", "HorizontalSwizzle"],`
			`help="This option describes how thread blocks are scheduled on GPU")`
			`# precompute mode`
			`parser.add_argument("-pm", "--precompute_mode",`
			`default="Device", type=str, choices=["Host", "Device"],`
			`help="Grouped Gemm Scheduing on device only (Device) or using host precompute (Host)")`
			`# arguments`
			`parser.add_argument("-p", "--problem_size_dir", type=str,`
			`help="path to the csv file contains the problem sizes")`
			`parser.add_argument("-alpha", "--alpha", default=1.0, type=float, help="alpha")`
			`parser.add_argument("-beta", "--beta", default=0.0, type=float, help="beta")`

			`parser.add_argument('--print_cuda', action="store_true",`
			`help="print the underlying CUDA kernel")`

			`try:`
			`args = parser.parse_args()`
			`except:`
			`sys.exit(0)`

			`pycutlass.get_memory_pool(init_pool_size=230, max_pool_size=232)`

			`element_a = getattr(cutlass, args.element_a)`
			`element_b = getattr(cutlass, args.element_b)`
			`element_c = getattr(cutlass, args.element_c)`
			`element_acc = getattr(cutlass, args.element_acc)`
			`math_operation = getattr(MathOperation, args.math)`
			`opclass = getattr(cutlass.OpClass, args.opcode)`

			`math_inst = MathInstruction(`
			`args.instruction_shape, element_a, element_b,`
			`element_acc, opclass, math_operation`
			`)`

			`tile_description = TileDescription(`
			`args.threadblock_shape, args.stages, args.warp_count,`
			`math_inst, args.compute_capability, args.compute_capability`
			`)`

			`layout_a = getattr(cutlass, args.layout_a)`
			`layout_b = getattr(cutlass, args.layout_b)`
			`layout_c = getattr(cutlass, args.layout_c)`

			`A = TensorDescription(`
			`element_a, layout_a, args.alignment_a`
			`)`

			`B = TensorDescription(`
			`element_b, layout_b, args.alignment_b`
			`)`

			`C = TensorDescription(`
			`element_c, layout_c, args.alignment_c`
			`)`

			`element_epilogue = getattr(cutlass, args.element_epilogue)`
			`epilogue_functor = getattr(EpilogueFunctor, args.epilogue_functor)`
			`swizzling_functor = getattr(cutlass, args.swizzling_functor)`
			`precompute_mode = getattr(SchedulerMode, args.precompute_mode)`

			`operation = GemmOperationGrouped(`
			`arch=args.compute_capability, tile_description=tile_description,`
			`A=A, B=B, C=C, element_epilogue=element_epilogue,`
			`epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,`
			`precompute_mode=precompute_mode`
			`)`

			`if args.print_cuda:`
			`print(operation.rt_module.emit())`

			`pycutlass.compiler.add_module([operation, ])`

			`reference_module = ReferenceModule(A, B, C)`

			`# get problems`
			`problem_sizes = []`
			`with open(args.problem_size_dir) as csv_file:`
			`reader = csv.reader(csv_file)`
			`for row in reader:`
			`problem_sizes.append(`
			`cutlass.gemm.GemmCoord(int(row[0]), int(row[1]), int(row[2]))`
			`)`

			`problem_count = len(problem_sizes)`

			`tensor_As = []`
			`tensor_Bs = []`
			`tensor_Cs = []`
			`tensor_Ds = []`
			`problem_sizes_coord = []`
			`tensor_D_refs = []`

			`for problem_size in problem_sizes:`
			`if args.element_a != "int8":`
			`if args.element_a == "bfloat16":`
			`tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()`
			`* problem_size.k(),))).astype(bfloat16)`
			`else:`
			`tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()`
			`* problem_size.k(),))).astype(getattr(np, args.element_a))`
			`else:`
			`tensor_A = np.random.uniform(low=-2, high=2, size=(problem_size.m()`
			`* problem_size.k(),)).astype(getattr(np, args.element_a))`

			`if args.element_b != "int8":`
			`if args.element_b == "bfloat16":`
			`tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k()`
			`* problem_size.n(),))).astype(bfloat16)`
			`else:`
			`tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k()`
			`* problem_size.n(),))).astype(getattr(np, args.element_b))`
			`else:`
			`tensor_B = np.random.uniform(low=-2, high=2, size=(problem_size.k()`
			`* problem_size.n(),)).astype(getattr(np, args.element_b))`

			`if args.element_c != "int8":`
			`if args.element_c == "bfloat16":`
			`tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()`
			`* problem_size.n(),))).astype(bfloat16)`
			`else:`
			`tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()`
			`* problem_size.n(),))).astype(getattr(np, args.element_c))`
			`else:`
			`tensor_C = np.random.uniform(low=-2, high=2, size=(problem_size.m()`
			`* problem_size.n(),)).astype(getattr(np, args.element_c))`
			`tensor_D = np.zeros_like(tensor_C)`

			`tensor_As.append(tensor_A)`
			`tensor_Bs.append(tensor_B)`
			`tensor_Cs.append(tensor_C)`
			`tensor_Ds.append(tensor_D)`
			`tensor_D_refs.append(reference_module.run(`
			`tensor_A, tensor_B, tensor_C, problem_size, args.alpha, args.beta))`
			`problem_sizes_coord.append(problem_size)`

			`arguments = GemmGroupedArguments(`
			`operation, problem_sizes_coord, tensor_As, tensor_Bs, tensor_Cs, tensor_Ds,`
			`output_op=LinearCombinationFunctorArguments(args.alpha, args.beta)`
			`)`

			`operation.run(arguments)`

			`arguments.sync()`

			`for tensor_d, tensor_d_ref in zip(tensor_Ds, tensor_D_refs):`
			`assert np.array_equal(tensor_d, tensor_d_ref)`

			`print("Passed.")`