
* New updates. * Minor profiler updates Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
797 lines
21 KiB
Python
797 lines
21 KiB
Python
#################################################################################################
|
|
#
|
|
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: BSD-3-Clause
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright notice, this
|
|
# list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
#
|
|
# 3. Neither the name of the copyright holder nor the names of its
|
|
# contributors may be used to endorse or promote products derived from
|
|
# this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
#################################################################################################
|
|
|
|
# System imports
|
|
import struct
|
|
import io
|
|
import ctypes
|
|
|
|
# CUDA Python import
|
|
from cuda import cuda
|
|
from cuda import nvrtc
|
|
|
|
# CUTLASS imports
|
|
from library import *
|
|
from gemm_operation import EmitGemmUniversalInstance
|
|
|
|
#################################################################################################
|
|
#
|
|
# CUTLASS Py Runtime Components
|
|
#
|
|
#################################################################################################
|
|
|
|
#
|
|
def MaxAlignment(fmt):
|
|
align = 1
|
|
for x in fmt:
|
|
align = max(align, struct.calcsize(x))
|
|
return align
|
|
|
|
#
|
|
def AlignedOffset(offset, align):
|
|
remainder = (offset % align)
|
|
if remainder:
|
|
offset += (align - remainder)
|
|
return offset
|
|
|
|
#
|
|
def PackInteger(host_workspace, offset, value):
|
|
fmt = "i"
|
|
padding = AlignedOffset(offset, 4)
|
|
struct.pack_into(fmt, host_workspace, offset, value)
|
|
return padding + struct.calcsize(fmt)
|
|
|
|
#
|
|
def PackDevicePointer(host_workspace, offset, value):
|
|
fmt = "P"
|
|
offset = AlignedOffset(offset, 8)
|
|
struct.pack_into(fmt, host_workspace, offset, value)
|
|
return offset + struct.calcsize(fmt)
|
|
|
|
#
|
|
def ceil_div(a, b):
|
|
return -(a // -b)
|
|
|
|
#################################################################################################
|
|
|
|
#
|
|
class PitchLinearCoord:
|
|
def __init__(self, contiguous, strided):
|
|
self.contiguous = contiguous
|
|
self.strided = strided
|
|
|
|
#
|
|
class GemmCoord:
|
|
def __init__(self, m = 1, n = 1, k = 1):
|
|
self.m = m
|
|
self.n = n
|
|
self.k = k
|
|
self.fmt = "iii"
|
|
|
|
#
|
|
def ceil_div(self, rhs):
|
|
return GemmCoord(ceil_div(self.m, rhs.m), ceil_div(self.n, rhs.n), ceil_div(self.k, rhs.k))
|
|
|
|
#
|
|
def size(self):
|
|
return struct.calcsize(self.fmt)
|
|
|
|
#
|
|
def alignment(self):
|
|
return MaxAlignment(self.fmt)
|
|
|
|
#
|
|
def pack_into(self, host_workspace, offset):
|
|
|
|
offset = AlignedOffset(offset, 4)
|
|
|
|
struct.pack_into(
|
|
self.fmt,
|
|
host_workspace,
|
|
offset,
|
|
self.m, self.n, self.k)
|
|
|
|
return offset + self.size()
|
|
|
|
#
|
|
class TensorRef:
|
|
def __init__(self, pointer = None, layout = 0):
|
|
self.pointer = pointer
|
|
self.layout = layout
|
|
|
|
def __str__(self):
|
|
return "(%x, %d)" % (self.pointer._ptr, self.layout)
|
|
|
|
#################################################################################################
|
|
|
|
#
|
|
class PredicatedTileAccessIteratorDesc:
|
|
'''
|
|
'''
|
|
|
|
def __init__(
|
|
self,
|
|
element_size_bits,
|
|
advance_rank,
|
|
threadblock_shape,
|
|
threadmap_iterations,
|
|
threadmap_delta):
|
|
|
|
self.element_size_bits = element_size_bits
|
|
self.advance_rank = advance_rank
|
|
self.threadblock_shape = threadblock_shape
|
|
self.threadmap_iterations = threadmap_iterations
|
|
self.threadmap_delta = threadmap_delta
|
|
|
|
#
|
|
class PredicatedTileAccessIteratorParams:
|
|
'''
|
|
'''
|
|
#
|
|
def __init__(self, desc, label):
|
|
self.desc = desc
|
|
self.label = label
|
|
self.fmt = "qqqq"
|
|
#
|
|
def size(self):
|
|
return struct.calcsize(self.fmt)
|
|
|
|
#
|
|
def alignment(self):
|
|
return MaxAlignment(self.fmt)
|
|
|
|
#
|
|
def initialize(self, host_workspace, offset, stride):
|
|
|
|
offset = AlignedOffset(offset, self.alignment())
|
|
|
|
inc_strided = stride * \
|
|
self.desc.threadmap_delta.strided * \
|
|
self.desc.element_size_bits // 8
|
|
|
|
if self.desc.advance_rank:
|
|
inc_advance = self.desc.threadblock_shape.strided * \
|
|
stride * \
|
|
self.desc.element_size_bits // 8
|
|
else:
|
|
inc_advance = self.desc.threadblock_shape.contiguous * \
|
|
self.desc.element_size_bits // 8
|
|
|
|
inc_next = inc_advance - (self.desc.threadmap_iterations.strided - 1) * \
|
|
self.desc.threadmap_delta.strided * \
|
|
stride * \
|
|
self.desc.element_size_bits // 8
|
|
|
|
struct.pack_into(
|
|
self.fmt,
|
|
host_workspace,
|
|
offset,
|
|
stride, inc_strided, inc_next, inc_advance)
|
|
|
|
return offset + self.size()
|
|
#
|
|
|
|
#################################################################################################
|
|
|
|
#
|
|
class EpilogueTileDesc:
|
|
'''
|
|
'''
|
|
def __init__(self, column, row, group, cluster, tile):
|
|
self.column = column
|
|
self.row = row
|
|
self.group = group
|
|
self.cluster = cluster
|
|
self.tile = tile
|
|
|
|
#
|
|
class EpilogueThreadMap:
|
|
'''
|
|
'''
|
|
def __init__(self, threads, elements_per_access, element_size_bits, shape, iterations, delta, count):
|
|
self.threads = threads
|
|
self.elements_per_access = elements_per_access
|
|
self.element_size_bits = element_size_bits
|
|
self.shape = shape
|
|
self.iterations = iterations
|
|
self.delta = delta
|
|
self.count = count
|
|
pass
|
|
|
|
#
|
|
class EpilogueTileIteratorParams:
|
|
'''
|
|
'''
|
|
#
|
|
def __init__(self, desc, label):
|
|
self.desc = desc
|
|
self.label = label
|
|
self.fmt = "qqqqqqqq"
|
|
|
|
#
|
|
def size(self):
|
|
return struct.calcsize(self.fmt)
|
|
|
|
#
|
|
def alignment(self):
|
|
return MaxAlignment(self.fmt)
|
|
|
|
#
|
|
def initialize(self, host_workspace, offset, stride):
|
|
|
|
stride = stride * self.desc.element_size_bits // 8
|
|
|
|
offset = AlignedOffset(offset, self.alignment())
|
|
|
|
increment_row = stride * self.desc.delta.row
|
|
|
|
increment_group = stride * self.desc.delta.group \
|
|
- stride * self.desc.delta.row * (self.desc.iterations.row - 1)
|
|
|
|
increment_cluster = stride * self.desc.delta.cluster \
|
|
- stride * self.desc.delta.group * (self.desc.iterations.group - 1) \
|
|
- stride * self.desc.delta.row * (self.desc.iterations.row - 1)
|
|
|
|
advance_row = stride * self.desc.shape.row
|
|
|
|
advance_group = stride * \
|
|
(self.desc.shape.group - 1) * \
|
|
self.desc.shape.row * \
|
|
self.desc.count.row
|
|
|
|
advance_cluster = stride * \
|
|
self.desc.count.group * \
|
|
self.desc.shape.group * \
|
|
self.desc.count.row * \
|
|
self.desc.shape.row
|
|
|
|
advance_tile = stride * \
|
|
self.desc.shape.group * \
|
|
self.desc.shape.row * \
|
|
self.desc.shape.cluster * \
|
|
self.desc.shape.tile
|
|
|
|
struct.pack_into(
|
|
self.fmt, \
|
|
host_workspace, \
|
|
offset, \
|
|
stride, \
|
|
increment_row, increment_group, increment_cluster, \
|
|
advance_row, advance_group, advance_cluster, advance_tile)
|
|
|
|
return offset + self.size()
|
|
#
|
|
|
|
#################################################################################################
|
|
#
|
|
# Launch configuration
|
|
#
|
|
#################################################################################################
|
|
|
|
class LaunchConfiguration:
|
|
def __init__(self, grid = [1,1,1], block = [1,1,1], smem = 0):
|
|
self.grid = grid
|
|
self.block = block
|
|
self.shared_memory_capacity = smem
|
|
|
|
#################################################################################################
|
|
#
|
|
# Functors
|
|
#
|
|
#################################################################################################
|
|
|
|
#
|
|
class Functor:
|
|
def __init__(self):
|
|
self.decl = ''
|
|
self.definition = ''
|
|
self.fmt = ''
|
|
self.identifier = ''
|
|
|
|
#
|
|
def emit_declaration(self):
|
|
return self.decl
|
|
|
|
#
|
|
def emit_definition(self):
|
|
return self.definition
|
|
|
|
#
|
|
def size(self):
|
|
'''
|
|
Size of the packed Params structure
|
|
'''
|
|
return struct.calcsize(self.fmt)
|
|
|
|
#
|
|
def alignment(self):
|
|
return MaxAlignment(self.fmt)
|
|
|
|
#
|
|
def initialize(self, host_workspace, offset, arguments):
|
|
return offset + self.size()
|
|
|
|
#################################################################################################
|
|
|
|
#
|
|
class LinearCombinationFunctorArguments:
|
|
def __init__(self, alpha = 1.0, beta = 0.0):
|
|
self.alpha = alpha
|
|
self.beta = beta
|
|
self.alpha_ptr = 0
|
|
self.beta_ptr = 0
|
|
|
|
#
|
|
class LinearCombinationFunctor(Functor):
|
|
def __init__(self):
|
|
super().__init__()
|
|
|
|
self.decl = """
|
|
cutlass::epilogue::thread::LinearCombination<
|
|
float,
|
|
1,
|
|
float,
|
|
float
|
|
>"""
|
|
self.identifier = 'linear_combination'
|
|
self.fmt = "ffPP"
|
|
|
|
#
|
|
def size(self):
|
|
'''
|
|
Size of the packed Params structure
|
|
'''
|
|
return struct.calcsize(self.fmt)
|
|
|
|
#
|
|
def alignment(self):
|
|
return MaxAlignment(self.fmt)
|
|
|
|
#
|
|
def initialize(self, host_workspace, offset, arguments):
|
|
|
|
offset = AlignedOffset(offset, self.alignment())
|
|
|
|
struct.pack_into(
|
|
self.fmt,
|
|
host_workspace, offset,
|
|
arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr)
|
|
|
|
return offset + self.size()
|
|
|
|
#################################################################################################
|
|
#
|
|
# Base class for an executable operation
|
|
#
|
|
#################################################################################################
|
|
|
|
#
|
|
class ExecutableOperation:
|
|
'''
|
|
'''
|
|
def __init__(self, operation):
|
|
self.operation = operation
|
|
self.module = None
|
|
self.kernel = None
|
|
|
|
#
|
|
def name(self):
|
|
return self.operation.procedural_name()
|
|
|
|
#
|
|
def emit(self):
|
|
return ''
|
|
|
|
#
|
|
def can_implement(self, configuration, arguments):
|
|
return False
|
|
|
|
#
|
|
def get_host_workspace_size(self, arguments):
|
|
return 0
|
|
|
|
#
|
|
def get_device_workspace_size(self, arguments):
|
|
return 0
|
|
|
|
#
|
|
def plan(self, arguments):
|
|
return LaunchConfiguration()
|
|
|
|
#
|
|
def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
|
|
raise NotImplementedError()
|
|
|
|
#
|
|
def run(self, host_workspace, device_workspace, launch_config, stream = cuda.CUstream(0)):
|
|
|
|
cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
|
|
packed = (ctypes.c_void_p * 1)()
|
|
packed[0] = ctypes.addressof(cArg)
|
|
|
|
err, = cuda.cuLaunchKernel(
|
|
self.kernel,
|
|
launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
|
|
launch_config.block[0], launch_config.block[1], launch_config.block[2],
|
|
launch_config.shared_memory_capacity,
|
|
stream,
|
|
packed,
|
|
0)
|
|
|
|
return err
|
|
|
|
#################################################################################################
|
|
|
|
|
|
#
|
|
class GemmArguments:
|
|
'''
|
|
'''
|
|
def __init__(self):
|
|
self.problem_size = GemmCoord(0, 0, 0)
|
|
self.A = TensorRef()
|
|
self.B = TensorRef()
|
|
self.C = TensorRef()
|
|
self.D = TensorRef()
|
|
self.output_op = LinearCombinationFunctorArguments()
|
|
|
|
#
|
|
class ThreadblockSwizzle:
|
|
def __init__(self, threadblock_shape, log_threadblock_cohort = 0):
|
|
self.threadblock_shape = threadblock_shape
|
|
self.log_threadblock_cohort = log_threadblock_cohort
|
|
|
|
def grid_tiled_shape(self, problem_size):
|
|
return GemmCoord(
|
|
ceil_div(problem_size.m, self.threadblock_shape.m),
|
|
ceil_div(problem_size.n, self.threadblock_shape.n),
|
|
1)
|
|
|
|
#
|
|
class Gemm(ExecutableOperation):
|
|
'''
|
|
GEMM manages the CUTLASS runtime components
|
|
'''
|
|
#
|
|
def __init__(self, operation):
|
|
super().__init__(operation)
|
|
|
|
self.emitter = EmitGemmUniversalInstance('_type')
|
|
self.threadblock_swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8))
|
|
|
|
self.threads = 256
|
|
self.shared_memory_capacity = (32 << 10)
|
|
|
|
self.params_A = PredicatedTileAccessIteratorParams(
|
|
PredicatedTileAccessIteratorDesc(
|
|
32,
|
|
1,
|
|
PitchLinearCoord(128, 8),
|
|
PitchLinearCoord(1, 4),
|
|
PitchLinearCoord(1, 2)), 'A')
|
|
|
|
self.params_B = PredicatedTileAccessIteratorParams(
|
|
PredicatedTileAccessIteratorDesc(
|
|
32,
|
|
1,
|
|
PitchLinearCoord(128, 8),
|
|
PitchLinearCoord(1, 4),
|
|
PitchLinearCoord(1, 2)), 'B')
|
|
|
|
self.params_C = EpilogueTileIteratorParams(
|
|
EpilogueThreadMap(
|
|
256,
|
|
1,
|
|
32,
|
|
EpilogueTileDesc(128, 1, 4, 4, 1),
|
|
EpilogueTileDesc(4, 1, 2, 1, 1),
|
|
EpilogueTileDesc(32, 1, 8, 1, 1),
|
|
EpilogueTileDesc(1, 4, 2, 1, 8)), 'C')
|
|
|
|
self.params_D = EpilogueTileIteratorParams(
|
|
EpilogueThreadMap(
|
|
256,
|
|
1,
|
|
32,
|
|
EpilogueTileDesc(128, 1, 4, 4, 1),
|
|
EpilogueTileDesc(4, 1, 2, 1, 1),
|
|
EpilogueTileDesc(32, 1, 8, 1, 1),
|
|
EpilogueTileDesc(1, 4, 2, 1, 8)), 'D')
|
|
|
|
self.output_op = LinearCombinationFunctor()
|
|
|
|
#
|
|
def emit(self):
|
|
return self.emitter.emit(self.operation)
|
|
|
|
#
|
|
def can_implement(self, configuration, arguments):
|
|
pass
|
|
|
|
#
|
|
def get_host_workspace_size(self, arguments):
|
|
return 336
|
|
|
|
#
|
|
def get_device_workspace_size(self, arguments):
|
|
return 0
|
|
|
|
#
|
|
def plan(self, arguments):
|
|
grid = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
|
|
return LaunchConfiguration([grid.m, grid.n, grid.k], [self.threads, 1, 1], self.shared_memory_capacity)
|
|
|
|
#
|
|
def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
|
|
|
|
offset = 0
|
|
|
|
# Compute intermediate results
|
|
swizzle_log_tile = 0
|
|
gemm_mode = 0
|
|
batch_count = 1
|
|
gemm_k_size = arguments.problem_size.k
|
|
|
|
# Pack into the host workspace buffer
|
|
offset = arguments.problem_size.pack_into(host_workspace, offset)
|
|
|
|
grid_tiled_shape = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
|
|
offset = grid_tiled_shape.pack_into(host_workspace, offset)
|
|
|
|
offset = PackInteger(host_workspace, offset, swizzle_log_tile)
|
|
|
|
offset = self.params_A.initialize(host_workspace, offset, arguments.A.layout)
|
|
offset = self.params_B.initialize(host_workspace, offset, arguments.B.layout)
|
|
offset = self.params_C.initialize(host_workspace, offset, arguments.C.layout)
|
|
offset = self.params_D.initialize(host_workspace, offset, arguments.D.layout)
|
|
|
|
offset = self.output_op.initialize(host_workspace, offset, arguments.output_op)
|
|
|
|
offset = PackInteger(host_workspace, offset, gemm_mode)
|
|
offset = PackInteger(host_workspace, offset, batch_count)
|
|
offset = PackInteger(host_workspace, offset, gemm_k_size)
|
|
offset = PackDevicePointer(host_workspace, offset, int(arguments.A.pointer))
|
|
offset = PackDevicePointer(host_workspace, offset, int(arguments.B.pointer))
|
|
offset = PackDevicePointer(host_workspace, offset, int(arguments.C.pointer))
|
|
offset = PackDevicePointer(host_workspace, offset, int(arguments.D.pointer))
|
|
|
|
return offset
|
|
|
|
|
|
#################################################################################################
|
|
#
|
|
# Module represents a compilation unit
|
|
#
|
|
#################################################################################################
|
|
|
|
#
|
|
class CompilationOptions:
|
|
'''
|
|
Compilation options.
|
|
'''
|
|
|
|
#
|
|
def __init__(self, architectures = [80], include_paths = []):
|
|
self.includes = []
|
|
self.include_paths = include_paths
|
|
self.flags = ['-std=c++11', '-default-device']
|
|
self.architectures = architectures
|
|
|
|
#
|
|
def get(self):
|
|
options = []
|
|
|
|
for flag in self.flags:
|
|
options.append(bytes(str.encode(flag)))
|
|
|
|
for incl in self.include_paths:
|
|
options.append(bytes(str.encode('--include-path=%s' % incl)))
|
|
|
|
arch_list = "-arch="
|
|
for idx, arch in enumerate(self.architectures):
|
|
if idx:
|
|
arch_list += ","
|
|
arch_list += "sm_%d" % arch
|
|
|
|
options.append(bytes(str.encode(arch_list)))
|
|
|
|
return options
|
|
|
|
IncludeTemplate = r'''#include "${include}"
|
|
'''
|
|
|
|
KernelTemplate = r'''
|
|
extern "C"
|
|
__global__ void
|
|
${operation_name}(${operation_name}${operation_suffix}::Params params) {
|
|
|
|
// Dynamic shared memory base pointer
|
|
extern __shared__ int SharedStorageBase[];
|
|
|
|
// Declare pointer to dynamic shared memory.
|
|
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
|
|
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
|
|
|
|
${operation_name}${operation_suffix} op;
|
|
|
|
op(params, *shared_storage);
|
|
}
|
|
|
|
'''
|
|
|
|
#
|
|
class Module:
|
|
def __init__(self, name, operations, compilation_options):
|
|
self.name = name
|
|
self.operations = operations
|
|
self.module = None
|
|
self.log = None
|
|
self.cubin_image = None
|
|
self.source_buffer = ''
|
|
|
|
#
|
|
# Emit source
|
|
#
|
|
self.emit_()
|
|
|
|
#
|
|
# Compile
|
|
#
|
|
self.compile_(compilation_options)
|
|
|
|
#
|
|
# Load module
|
|
#
|
|
self.load_()
|
|
|
|
# Done
|
|
return
|
|
|
|
# Emit a source buffer
|
|
def emit_(self):
|
|
|
|
# 1. Includes
|
|
includes = []
|
|
for operation in self.operations:
|
|
for incl in operation.emitter.includes:
|
|
if incl not in includes:
|
|
includes.append(incl)
|
|
|
|
for incl in includes:
|
|
self.source_buffer += SubstituteTemplate(IncludeTemplate, { 'include': incl} )
|
|
|
|
# 2. Operations
|
|
for operation in self.operations:
|
|
self.source_buffer += operation.emit()
|
|
values = {
|
|
'operation_name': operation.name(),
|
|
'operation_suffix': operation.emitter.operation_suffix
|
|
}
|
|
self.source_buffer += SubstituteTemplate(KernelTemplate, values)
|
|
|
|
# Done
|
|
return
|
|
|
|
# Compile with NVRTC
|
|
def compile_(self, compilation_options):
|
|
|
|
err, program = nvrtc.nvrtcCreateProgram(
|
|
str.encode(self.source_buffer),
|
|
bytes(str.encode(self.name)),
|
|
0, [], [])
|
|
|
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
|
raise RuntimeError('NVRTC Error: {}'.format(err))
|
|
|
|
# Compile program
|
|
options = compilation_options.get()
|
|
|
|
err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
|
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
|
|
|
error_string = 'NVRTC Error: {}\n'.format(err)
|
|
|
|
# Get log from compilation
|
|
err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
|
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
|
raise RuntimeError('NVRTC Error: {}'.format(err))
|
|
|
|
self.log = b' ' * logSize
|
|
err, = nvrtc.nvrtcGetProgramLog(program, self.log)
|
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
|
raise RuntimeError('NVRTC Error: {}'.format(err))
|
|
|
|
raise RuntimeError(error_string + self.log.decode() + self.source_buffer)
|
|
|
|
# Get data from compilation
|
|
err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
|
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
|
raise RuntimeError('NVRTC Error: {}'.format(err))
|
|
|
|
self.cubin_image = b' ' * dataSize
|
|
err, = nvrtc.nvrtcGetCUBIN(program, self.cubin_image)
|
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
|
raise RuntimeError('NVRTC Error: {}'.format(err))
|
|
|
|
return
|
|
|
|
#
|
|
def load_(self):
|
|
|
|
# Load data as module data
|
|
err, self.module = cuda.cuModuleLoadData(self.cubin_image)
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError('Cuda Error: {}'.format(err))
|
|
|
|
# Get functions
|
|
for operation in self.operations:
|
|
err, operation.kernel = cuda.cuModuleGetFunction(
|
|
self.module,
|
|
bytes(str.encode(operation.name())))
|
|
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError('Cuda Error: {}'.format(err))
|
|
|
|
operation.module = self
|
|
|
|
return
|
|
|
|
|
|
#################################################################################################
|
|
#
|
|
# Manifest represents an 'owner' for modules and operations
|
|
#
|
|
#################################################################################################
|
|
|
|
#
|
|
class Manifest:
|
|
|
|
#
|
|
def __init__(self):
|
|
self.operations = {}
|
|
self.modules = []
|
|
pass
|
|
|
|
#
|
|
def append_module(self, module):
|
|
'''
|
|
Appends a module and takes ownership of operations used to construct it.
|
|
'''
|
|
|
|
self.modules.append(module)
|
|
|
|
for operation in module.operations:
|
|
self.operations[operation.name()] = operation
|
|
|
|
|
|
#################################################################################################
|