#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# System imports
import struct
import io
import ctypes
# CUDA Python import
from cuda import cuda
from cuda import nvrtc
# CUTLASS imports
from library import *
from gemm_operation import EmitGemmUniversalInstance
#################################################################################################
#
# CUTLASS Py Runtime Components
#
#################################################################################################
#
def MaxAlignment(fmt):
  align = 1
  for x in fmt:
    align = max(align, struct.calcsize(x))
  return align
#
def AlignedOffset(offset, align):
  remainder = (offset % align)
  if remainder:
    offset += (align - remainder)
  return offset
#
def PackInteger(host_workspace, offset, value):
  fmt = "i"
  # Align to a 4-byte boundary before packing so the value lands at the offset
  # accounted for by the return value.
  offset = AlignedOffset(offset, 4)
  struct.pack_into(fmt, host_workspace, offset, value)
  return offset + struct.calcsize(fmt)
#
def PackDevicePointer(host_workspace, offset, value):
  fmt = "P"
  offset = AlignedOffset(offset, 8)
  struct.pack_into(fmt, host_workspace, offset, value)
  return offset + struct.calcsize(fmt)
#
def ceil_div(a, b):
  return -(a // -b)
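#
def _example_pack_params_buffer():
  '''
  Illustrative sketch only (not part of the original module): shows how the
  alignment and packing helpers above combine to lay out a host-side params
  buffer. The 64-byte buffer size and the packed values are arbitrary example
  choices.
  '''
  host_workspace = bytearray(64)
  offset = 0
  offset = PackInteger(host_workspace, offset, 3)        # 4-byte aligned int at offset 0
  offset = PackDevicePointer(host_workspace, offset, 0)  # pointer aligned up to offset 8
  return offset                                          # 16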
#################################################################################################
#
class PitchLinearCoord:
  def __init__(self, contiguous, strided):
    self.contiguous = contiguous
    self.strided = strided
#
class GemmCoord:
  def __init__(self, m = 1, n = 1, k = 1):
    self.m = m
    self.n = n
    self.k = k
    self.fmt = "iii"
  #
  def ceil_div(self, rhs):
    return GemmCoord(ceil_div(self.m, rhs.m), ceil_div(self.n, rhs.n), ceil_div(self.k, rhs.k))
  #
  def size(self):
    return struct.calcsize(self.fmt)
  #
  def alignment(self):
    return MaxAlignment(self.fmt)
  #
  def pack_into(self, host_workspace, offset):
    offset = AlignedOffset(offset, 4)
    struct.pack_into(
      self.fmt,
      host_workspace,
      offset,
      self.m, self.n, self.k)
    return offset + self.size()
#
class TensorRef:
  def __init__(self, pointer = None, layout = 0):
    self.pointer = pointer
    self.layout = layout
  def __str__(self):
    return "(%x, %d)" % (self.pointer._ptr, self.layout)
#################################################################################################
#
class PredicatedTileAccessIteratorDesc:
  '''
  Descriptor for a predicated tile access iterator: element size, advance rank,
  threadblock shape, and the thread map's iteration and delta shapes.
  '''
  def __init__(
      self,
      element_size_bits,
      advance_rank,
      threadblock_shape,
      threadmap_iterations,
      threadmap_delta):
    self.element_size_bits = element_size_bits
    self.advance_rank = advance_rank
    self.threadblock_shape = threadblock_shape
    self.threadmap_iterations = threadmap_iterations
    self.threadmap_delta = threadmap_delta
#
class PredicatedTileAccessIteratorParams:
  '''
  Packs the stride and precomputed increments of a predicated tile access
  iterator into the kernel's Params structure (layout "qqqq").
  '''
  #
  def __init__(self, desc, label):
    self.desc = desc
    self.label = label
    self.fmt = "qqqq"
  #
  def size(self):
    return struct.calcsize(self.fmt)
  #
  def alignment(self):
    return MaxAlignment(self.fmt)
  #
  def initialize(self, host_workspace, offset, stride):
    offset = AlignedOffset(offset, self.alignment())
    inc_strided = stride * \
      self.desc.threadmap_delta.strided * \
      self.desc.element_size_bits // 8
    if self.desc.advance_rank:
      inc_advance = self.desc.threadblock_shape.strided * \
        stride * \
        self.desc.element_size_bits // 8
    else:
      inc_advance = self.desc.threadblock_shape.contiguous * \
        self.desc.element_size_bits // 8
    inc_next = inc_advance - (self.desc.threadmap_iterations.strided - 1) * \
      self.desc.threadmap_delta.strided * \
      stride * \
      self.desc.element_size_bits // 8
    struct.pack_into(
      self.fmt,
      host_workspace,
      offset,
      stride, inc_strided, inc_next, inc_advance)
    return offset + self.size()
#
#################################################################################################
#
class EpilogueTileDesc:
  '''
  Shape descriptor (column, row, group, cluster, tile) used by the epilogue
  thread map.
  '''
  def __init__(self, column, row, group, cluster, tile):
    self.column = column
    self.row = row
    self.group = group
    self.cluster = cluster
    self.tile = tile
#
class EpilogueThreadMap:
  '''
  Epilogue output tile thread map: thread count, access width, element size,
  and the shape, iterations, delta, and count descriptors.
  '''
  def __init__(self, threads, elements_per_access, element_size_bits, shape, iterations, delta, count):
    self.threads = threads
    self.elements_per_access = elements_per_access
    self.element_size_bits = element_size_bits
    self.shape = shape
    self.iterations = iterations
    self.delta = delta
    self.count = count
#
class EpilogueTileIteratorParams:
  '''
  Packs the stride and precomputed row/group/cluster/tile increments of the
  epilogue output tile iterator into the kernel's Params structure
  (layout "qqqqqqqq").
  '''
  #
  def __init__(self, desc, label):
    self.desc = desc
    self.label = label
    self.fmt = "qqqqqqqq"
  #
  def size(self):
    return struct.calcsize(self.fmt)
  #
  def alignment(self):
    return MaxAlignment(self.fmt)
  #
  def initialize(self, host_workspace, offset, stride):
    stride = stride * self.desc.element_size_bits // 8
    offset = AlignedOffset(offset, self.alignment())
    increment_row = stride * self.desc.delta.row
    increment_group = stride * self.desc.delta.group \
      - stride * self.desc.delta.row * (self.desc.iterations.row - 1)
    increment_cluster = stride * self.desc.delta.cluster \
      - stride * self.desc.delta.group * (self.desc.iterations.group - 1) \
      - stride * self.desc.delta.row * (self.desc.iterations.row - 1)
    advance_row = stride * self.desc.shape.row
    advance_group = stride * \
      (self.desc.shape.group - 1) * \
      self.desc.shape.row * \
      self.desc.count.row
    advance_cluster = stride * \
      self.desc.count.group * \
      self.desc.shape.group * \
      self.desc.count.row * \
      self.desc.shape.row
    advance_tile = stride * \
      self.desc.shape.group * \
      self.desc.shape.row * \
      self.desc.shape.cluster * \
      self.desc.shape.tile
    struct.pack_into(
      self.fmt,
      host_workspace,
      offset,
      stride,
      increment_row, increment_group, increment_cluster,
      advance_row, advance_group, advance_cluster, advance_tile)
    return offset + self.size()
#
#################################################################################################
#
# Launch configuration
#
#################################################################################################
class LaunchConfiguration:
  def __init__(self, grid = [1,1,1], block = [1,1,1], smem = 0):
    self.grid = grid
    self.block = block
    self.shared_memory_capacity = smem
#################################################################################################
#
# Functors
#
#################################################################################################
#
class Functor:
  def __init__(self):
    self.decl = ''
    self.definition = ''
    self.fmt = ''
    self.identifier = ''
  #
  def emit_declaration(self):
    return self.decl
  #
  def emit_definition(self):
    return self.definition
  #
  def size(self):
    '''
    Size of the packed Params structure
    '''
    return struct.calcsize(self.fmt)
  #
  def alignment(self):
    return MaxAlignment(self.fmt)
  #
  def initialize(self, host_workspace, offset, arguments):
    return offset + self.size()
#################################################################################################
#
class LinearCombinationFunctorArguments:
  def __init__(self, alpha = 1.0, beta = 0.0):
    self.alpha = alpha
    self.beta = beta
    self.alpha_ptr = 0
    self.beta_ptr = 0
#
class LinearCombinationFunctor(Functor):
  def __init__(self):
    super().__init__()
    self.decl = """
      cutlass::epilogue::thread::LinearCombination<
        float,
        1,
        float,
        float
      >"""
    self.identifier = 'linear_combination'
    self.fmt = "ffPP"
  #
  def size(self):
    '''
    Size of the packed Params structure
    '''
    return struct.calcsize(self.fmt)
  #
  def alignment(self):
    return MaxAlignment(self.fmt)
  #
  def initialize(self, host_workspace, offset, arguments):
    offset = AlignedOffset(offset, self.alignment())
    struct.pack_into(
      self.fmt,
      host_workspace, offset,
      arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr)
    return offset + self.size()
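#
def _example_pack_epilogue_arguments():
  '''
  Illustrative sketch only (not part of the original module): packs a
  LinearCombinationFunctorArguments instance into a host buffer using the
  functor's "ffPP" layout (alpha, beta, alpha_ptr, beta_ptr).
  '''
  functor = LinearCombinationFunctor()
  args = LinearCombinationFunctorArguments(alpha = 1.0, beta = 0.0)
  host_workspace = bytearray(functor.size() + functor.alignment())
  return functor.initialize(host_workspace, 0, args)   # offset past the packed arguments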
#################################################################################################
#
# Base class for an executable operation
#
#################################################################################################
#
class ExecutableOperation:
  '''
  Base class for an operation that can be emitted, compiled, loaded, and
  launched through the CUDA driver API.
  '''
  def __init__(self, operation):
    self.operation = operation
    self.module = None
    self.kernel = None
  #
  def name(self):
    return self.operation.procedural_name()
  #
  def emit(self):
    return ''
  #
  def can_implement(self, configuration, arguments):
    return False
  #
  def get_host_workspace_size(self, arguments):
    return 0
  #
  def get_device_workspace_size(self, arguments):
    return 0
  #
  def plan(self, arguments):
    return LaunchConfiguration()
  #
  def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
    raise NotImplementedError()
  #
  def run(self, host_workspace, device_workspace, launch_config, stream = cuda.CUstream(0)):
    cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
    packed = (ctypes.c_void_p * 1)()
    packed[0] = ctypes.addressof(cArg)
    err, = cuda.cuLaunchKernel(
      self.kernel,
      launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
      launch_config.block[0], launch_config.block[1], launch_config.block[2],
      launch_config.shared_memory_capacity,
      stream,
      packed,
      0)
    return err
#################################################################################################
#
class GemmArguments:
  '''
  Arguments to a GEMM: problem size, A/B/C/D tensor references, and the
  epilogue output operator arguments.
  '''
  def __init__(self):
    self.problem_size = GemmCoord(0, 0, 0)
    self.A = TensorRef()
    self.B = TensorRef()
    self.C = TensorRef()
    self.D = TensorRef()
    self.output_op = LinearCombinationFunctorArguments()
#
class ThreadblockSwizzle:
  def __init__(self, threadblock_shape, log_threadblock_cohort = 0):
    self.threadblock_shape = threadblock_shape
    self.log_threadblock_cohort = log_threadblock_cohort
  def grid_tiled_shape(self, problem_size):
    return GemmCoord(
      ceil_div(problem_size.m, self.threadblock_shape.m),
      ceil_div(problem_size.n, self.threadblock_shape.n),
      1)
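#
def _example_grid_tiled_shape():
  '''
  Illustrative sketch only (not part of the original module): for a
  1024x512x256 GEMM and the 128x128x8 threadblock tile used below, the
  swizzle produces an (8, 4, 1) grid of threadblocks.
  '''
  swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8))
  grid = swizzle.grid_tiled_shape(GemmCoord(1024, 512, 256))
  return (grid.m, grid.n, grid.k)   # (8, 4, 1)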
#
class Gemm(ExecutableOperation):
  '''
  GEMM manages the CUTLASS runtime components
  '''
  #
  def __init__(self, operation):
    super().__init__(operation)
    self.emitter = EmitGemmUniversalInstance('_type')
    self.threadblock_swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8))
    self.threads = 256
    self.shared_memory_capacity = (32 << 10)
    self.params_A = PredicatedTileAccessIteratorParams(
      PredicatedTileAccessIteratorDesc(
        32,
        1,
        PitchLinearCoord(128, 8),
        PitchLinearCoord(1, 4),
        PitchLinearCoord(1, 2)), 'A')
    self.params_B = PredicatedTileAccessIteratorParams(
      PredicatedTileAccessIteratorDesc(
        32,
        1,
        PitchLinearCoord(128, 8),
        PitchLinearCoord(1, 4),
        PitchLinearCoord(1, 2)), 'B')
    self.params_C = EpilogueTileIteratorParams(
      EpilogueThreadMap(
        256,
        1,
        32,
        EpilogueTileDesc(128, 1, 4, 4, 1),
        EpilogueTileDesc(4, 1, 2, 1, 1),
        EpilogueTileDesc(32, 1, 8, 1, 1),
        EpilogueTileDesc(1, 4, 2, 1, 8)), 'C')
    self.params_D = EpilogueTileIteratorParams(
      EpilogueThreadMap(
        256,
        1,
        32,
        EpilogueTileDesc(128, 1, 4, 4, 1),
        EpilogueTileDesc(4, 1, 2, 1, 1),
        EpilogueTileDesc(32, 1, 8, 1, 1),
        EpilogueTileDesc(1, 4, 2, 1, 8)), 'D')
    self.output_op = LinearCombinationFunctor()
  #
  def emit(self):
    return self.emitter.emit(self.operation)
  #
  def can_implement(self, configuration, arguments):
    pass
  #
  def get_host_workspace_size(self, arguments):
    return 336
  #
  def get_device_workspace_size(self, arguments):
    return 0
  #
  def plan(self, arguments):
    grid = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
    return LaunchConfiguration([grid.m, grid.n, grid.k], [self.threads, 1, 1], self.shared_memory_capacity)
  #
  def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
    offset = 0
    # Compute intermediate results
    swizzle_log_tile = 0
    gemm_mode = 0
    batch_count = 1
    gemm_k_size = arguments.problem_size.k
    # Pack into the host workspace buffer
    offset = arguments.problem_size.pack_into(host_workspace, offset)
    grid_tiled_shape = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
    offset = grid_tiled_shape.pack_into(host_workspace, offset)
    offset = PackInteger(host_workspace, offset, swizzle_log_tile)
    offset = self.params_A.initialize(host_workspace, offset, arguments.A.layout)
    offset = self.params_B.initialize(host_workspace, offset, arguments.B.layout)
    offset = self.params_C.initialize(host_workspace, offset, arguments.C.layout)
    offset = self.params_D.initialize(host_workspace, offset, arguments.D.layout)
    offset = self.output_op.initialize(host_workspace, offset, arguments.output_op)
    offset = PackInteger(host_workspace, offset, gemm_mode)
    offset = PackInteger(host_workspace, offset, batch_count)
    offset = PackInteger(host_workspace, offset, gemm_k_size)
    offset = PackDevicePointer(host_workspace, offset, int(arguments.A.pointer))
    offset = PackDevicePointer(host_workspace, offset, int(arguments.B.pointer))
    offset = PackDevicePointer(host_workspace, offset, int(arguments.C.pointer))
    offset = PackDevicePointer(host_workspace, offset, int(arguments.D.pointer))
    return offset
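#
def _example_launch_gemm(gemm, arguments):
  '''
  Illustrative sketch only (not part of the original module): the typical
  plan/initialize/run flow for a Gemm whose kernel has already been compiled
  and loaded through Module. `gemm` is assumed to be such a Gemm instance and
  `arguments` a populated GemmArguments whose TensorRefs hold valid device
  pointers; a CUDA context is assumed to be current.
  '''
  launch_config = gemm.plan(arguments)
  host_workspace = bytearray(gemm.get_host_workspace_size(arguments))
  gemm.initialize(host_workspace, None, launch_config, arguments)
  return gemm.run(host_workspace, None, launch_config)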
#################################################################################################
#
# Module represents a compilation unit
#
#################################################################################################
#
class CompilationOptions:
  '''
  Compilation options.
  '''
  #
  def __init__(self, architectures = [80], include_paths = []):
    self.includes = []
    self.include_paths = include_paths
    self.flags = ['-std=c++11', '-default-device']
    self.architectures = architectures
  #
  def get(self):
    options = []
    for flag in self.flags:
      options.append(bytes(str.encode(flag)))
    for incl in self.include_paths:
      options.append(bytes(str.encode('--include-path=%s' % incl)))
    arch_list = "-arch="
    for idx, arch in enumerate(self.architectures):
      if idx:
        arch_list += ","
      arch_list += "sm_%d" % arch
    options.append(bytes(str.encode(arch_list)))
    return options
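#
def _example_compilation_options(cutlass_include_dir):
  '''
  Illustrative sketch only (not part of the original module): builds the NVRTC
  option list for an SM80 target. `cutlass_include_dir` is assumed to point at
  the CUTLASS `include/` directory; additional include paths (e.g. the CUDA
  toolkit headers) may also be required.
  '''
  options = CompilationOptions(architectures = [80], include_paths = [cutlass_include_dir])
  return options.get()   # e.g. [b'-std=c++11', b'-default-device', b'--include-path=...', b'-arch=sm_80']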
IncludeTemplate = r'''#include "${include}"
'''
KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
  // Dynamic shared memory base pointer
  extern __shared__ int SharedStorageBase[];
  // Declare pointer to dynamic shared memory.
  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
  ${operation_name}${operation_suffix} op;
  op(params, *shared_storage);
}
'''
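#
def _example_emit_kernel_source(operation_name, operation_suffix = '_type'):
  '''
  Illustrative sketch only (not part of the original module): expands
  KernelTemplate for a kernel name, mirroring what Module.emit_() does for
  each operation. The example name and suffix are placeholders.
  '''
  return SubstituteTemplate(KernelTemplate, {
    'operation_name': operation_name,
    'operation_suffix': operation_suffix})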
#
class Module:
  def __init__(self, name, operations, compilation_options):
    self.name = name
    self.operations = operations
    self.module = None
    self.log = None
    self.cubin_image = None
    self.source_buffer = ''
    #
    # Emit source
    #
    self.emit_()
    #
    # Compile
    #
    self.compile_(compilation_options)
    #
    # Load module
    #
    self.load_()
    # Done
    return

  # Emit a source buffer
  def emit_(self):
    # 1. Includes
    includes = []
    for operation in self.operations:
      for incl in operation.emitter.includes:
        if incl not in includes:
          includes.append(incl)
    for incl in includes:
      self.source_buffer += SubstituteTemplate(IncludeTemplate, { 'include': incl} )
    # 2. Operations
    for operation in self.operations:
      self.source_buffer += operation.emit()
      values = {
        'operation_name': operation.name(),
        'operation_suffix': operation.emitter.operation_suffix
      }
      self.source_buffer += SubstituteTemplate(KernelTemplate, values)
    # Done
    return

  # Compile with NVRTC
  def compile_(self, compilation_options):
    err, program = nvrtc.nvrtcCreateProgram(
      str.encode(self.source_buffer),
      bytes(str.encode(self.name)),
      0, [], [])
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
      raise RuntimeError('NVRTC Error: {}'.format(err))
    # Compile program
    options = compilation_options.get()
    err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
      error_string = 'NVRTC Error: {}\n'.format(err)
      # Get log from compilation
      err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
      if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
        raise RuntimeError('NVRTC Error: {}'.format(err))
      self.log = b' ' * logSize
      err, = nvrtc.nvrtcGetProgramLog(program, self.log)
      if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
        raise RuntimeError('NVRTC Error: {}'.format(err))
      raise RuntimeError(error_string + self.log.decode() + self.source_buffer)
    # Get data from compilation
    err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
      raise RuntimeError('NVRTC Error: {}'.format(err))
    self.cubin_image = b' ' * dataSize
    err, = nvrtc.nvrtcGetCUBIN(program, self.cubin_image)
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
      raise RuntimeError('NVRTC Error: {}'.format(err))
    return

  #
  def load_(self):
    # Load data as module data
    err, self.module = cuda.cuModuleLoadData(self.cubin_image)
    if err != cuda.CUresult.CUDA_SUCCESS:
      raise RuntimeError('Cuda Error: {}'.format(err))
    # Get functions
    for operation in self.operations:
      err, operation.kernel = cuda.cuModuleGetFunction(
        self.module,
        bytes(str.encode(operation.name())))
      if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError('Cuda Error: {}'.format(err))
      operation.module = self
    return
#################################################################################################
#
# Manifest represents an 'owner' for modules and operations
#
#################################################################################################
#
class Manifest:
  #
  def __init__(self):
    self.operations = {}
    self.modules = []
  #
  def append_module(self, module):
    '''
    Appends a module and takes ownership of operations used to construct it.
    '''
    self.modules.append(module)
    for operation in module.operations:
      self.operations[operation.name()] = operation
#################################################################################################
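#
def _example_build_and_register(operation, cutlass_include_dir):
  '''
  Illustrative end-to-end sketch only (not part of the original module):
  wraps a generated GEMM operation description, compiles and loads its kernel
  with NVRTC, and registers it with a Manifest. `operation` is assumed to be a
  GemmOperation description produced by the generator scripts,
  `cutlass_include_dir` the path to the CUTLASS headers, and a CUDA context is
  assumed to be current.
  '''
  gemm = Gemm(operation)
  options = CompilationOptions(architectures = [80], include_paths = [cutlass_include_dir])
  module = Module('rt_module', [gemm], options)   # emits source, compiles, and loads the cubin
  manifest = Manifest()
  manifest.append_module(module)
  return manifest.operations[gemm.name()]         # the loaded, launch-ready Gemm
#################################################################################################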