| 
									
										
										
										
											2022-04-24 03:02:38 +08:00
										 |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2023-01-21 05:32:57 +08:00
										 |  |  | # Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | 
					
						
							| 
									
										
										
										
											2022-04-24 03:02:38 +08:00
										 |  |  | # SPDX-License-Identifier: BSD-3-Clause | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Redistribution and use in source and binary forms, with or without | 
					
						
							|  |  |  | # modification, are permitted provided that the following conditions are met: | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # 1. Redistributions of source code must retain the above copyright notice, this | 
					
						
							|  |  |  | # list of conditions and the following disclaimer. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # 2. Redistributions in binary form must reproduce the above copyright notice, | 
					
						
							|  |  |  | # this list of conditions and the following disclaimer in the documentation | 
					
						
							|  |  |  | # and/or other materials provided with the distribution. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # 3. Neither the name of the copyright holder nor the names of its | 
					
						
							|  |  |  | # contributors may be used to endorse or promote products derived from | 
					
						
							|  |  |  | # this software without specific prior written permission. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
					
						
							|  |  |  | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
					
						
							|  |  |  | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 
					
						
							|  |  |  | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | 
					
						
							|  |  |  | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
					
						
							|  |  |  | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | 
					
						
							|  |  |  | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | 
					
						
							|  |  |  | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
					
						
							|  |  |  | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
					
						
							|  |  |  | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # System imports | 
					
						
							|  |  |  | import struct | 
					
						
							|  |  |  | import io | 
					
						
							|  |  |  | import ctypes | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # CUDA Python import | 
					
						
							|  |  |  | from cuda import cuda | 
					
						
							|  |  |  | from cuda import nvrtc | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # CUTLASS imports | 
					
						
							|  |  |  | from library import * | 
					
						
							|  |  |  | from gemm_operation import EmitGemmUniversalInstance | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # CUTLASS Py Runtime Components | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | def MaxAlignment(fmt): | 
					
						
							|  |  |  |   align = 1 | 
					
						
							|  |  |  |   for x in fmt: | 
					
						
							|  |  |  |     align = max(align, struct.calcsize(x)) | 
					
						
							|  |  |  |   return align | 
					
						
							|  |  |  |    | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | def AlignedOffset(offset, align): | 
					
						
							|  |  |  |   remainder = (offset % align) | 
					
						
							|  |  |  |   if remainder: | 
					
						
							|  |  |  |     offset += (align - remainder) | 
					
						
							|  |  |  |   return offset | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | def PackInteger(host_workspace, offset, value): | 
					
						
							|  |  |  |   fmt = "i" | 
					
						
							|  |  |  |   padding = AlignedOffset(offset, 4)   | 
					
						
							|  |  |  |   struct.pack_into(fmt, host_workspace, offset, value) | 
					
						
							|  |  |  |   return padding + struct.calcsize(fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | def PackDevicePointer(host_workspace, offset, value): | 
					
						
							|  |  |  |   fmt = "P" | 
					
						
							|  |  |  |   offset = AlignedOffset(offset, 8) | 
					
						
							|  |  |  |   struct.pack_into(fmt, host_workspace, offset, value) | 
					
						
							|  |  |  |   return offset + struct.calcsize(fmt) | 
					
						
							|  |  |  |    | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | def ceil_div(a, b): | 
					
						
							|  |  |  |   return -(a // -b) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class PitchLinearCoord: | 
					
						
							|  |  |  |   def __init__(self, contiguous, strided): | 
					
						
							|  |  |  |     self.contiguous = contiguous | 
					
						
							|  |  |  |     self.strided = strided | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class GemmCoord: | 
					
						
							|  |  |  |   def __init__(self, m = 1, n = 1, k = 1): | 
					
						
							|  |  |  |     self.m = m | 
					
						
							|  |  |  |     self.n = n | 
					
						
							|  |  |  |     self.k = k | 
					
						
							|  |  |  |     self.fmt = "iii" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def ceil_div(self, rhs): | 
					
						
							|  |  |  |     return GemmCoord(ceil_div(self.m, rhs.m), ceil_div(self.n, rhs.n), ceil_div(self.k, rhs.k)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def size(self): | 
					
						
							|  |  |  |     return struct.calcsize(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def alignment(self): | 
					
						
							|  |  |  |     return MaxAlignment(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def pack_into(self, host_workspace, offset): | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     offset = AlignedOffset(offset, 4) | 
					
						
							|  |  |  |    | 
					
						
							|  |  |  |     struct.pack_into( | 
					
						
							|  |  |  |       self.fmt,  | 
					
						
							|  |  |  |       host_workspace,  | 
					
						
							|  |  |  |       offset,  | 
					
						
							|  |  |  |       self.m, self.n, self.k) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return offset + self.size() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class TensorRef: | 
					
						
							|  |  |  |   def __init__(self, pointer = None, layout = 0): | 
					
						
							|  |  |  |     self.pointer = pointer | 
					
						
							|  |  |  |     self.layout = layout | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   def __str__(self): | 
					
						
							|  |  |  |     return "(%x, %d)" % (self.pointer._ptr, self.layout) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class PredicatedTileAccessIteratorDesc: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |    | 
					
						
							|  |  |  |   def __init__( | 
					
						
							|  |  |  |       self,  | 
					
						
							|  |  |  |       element_size_bits,  | 
					
						
							|  |  |  |       advance_rank,  | 
					
						
							|  |  |  |       threadblock_shape,  | 
					
						
							|  |  |  |       threadmap_iterations,  | 
					
						
							|  |  |  |       threadmap_delta): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.element_size_bits = element_size_bits | 
					
						
							|  |  |  |     self.advance_rank = advance_rank | 
					
						
							|  |  |  |     self.threadblock_shape = threadblock_shape | 
					
						
							|  |  |  |     self.threadmap_iterations = threadmap_iterations | 
					
						
							|  |  |  |     self.threadmap_delta = threadmap_delta | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class PredicatedTileAccessIteratorParams: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def __init__(self, desc, label): | 
					
						
							|  |  |  |     self.desc = desc | 
					
						
							|  |  |  |     self.label = label | 
					
						
							|  |  |  |     self.fmt = "qqqq" | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def size(self): | 
					
						
							|  |  |  |     return struct.calcsize(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def alignment(self): | 
					
						
							|  |  |  |     return MaxAlignment(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def initialize(self, host_workspace, offset, stride): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = AlignedOffset(offset, self.alignment()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     inc_strided = stride *                            \ | 
					
						
							|  |  |  |                   self.desc.threadmap_delta.strided * \ | 
					
						
							|  |  |  |                   self.desc.element_size_bits // 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if self.desc.advance_rank: | 
					
						
							|  |  |  |       inc_advance = self.desc.threadblock_shape.strided * \ | 
					
						
							|  |  |  |                           stride *                        \ | 
					
						
							|  |  |  |                           self.desc.element_size_bits // 8 | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |       inc_advance = self.desc.threadblock_shape.contiguous * \ | 
					
						
							|  |  |  |                           self.desc.element_size_bits // 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     inc_next = inc_advance - (self.desc.threadmap_iterations.strided - 1) * \ | 
					
						
							|  |  |  |                       self.desc.threadmap_delta.strided *                   \ | 
					
						
							|  |  |  |                       stride *                                              \ | 
					
						
							|  |  |  |                       self.desc.element_size_bits // 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     struct.pack_into( | 
					
						
							|  |  |  |       self.fmt,  | 
					
						
							|  |  |  |       host_workspace,  | 
					
						
							|  |  |  |       offset,  | 
					
						
							|  |  |  |       stride, inc_strided, inc_next, inc_advance) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return offset + self.size() | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class EpilogueTileDesc: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   def __init__(self, column, row, group, cluster, tile): | 
					
						
							|  |  |  |     self.column = column | 
					
						
							|  |  |  |     self.row = row | 
					
						
							|  |  |  |     self.group = group | 
					
						
							|  |  |  |     self.cluster = cluster | 
					
						
							|  |  |  |     self.tile = tile | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class EpilogueThreadMap: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   def __init__(self, threads, elements_per_access, element_size_bits, shape, iterations, delta, count): | 
					
						
							|  |  |  |     self.threads = threads | 
					
						
							|  |  |  |     self.elements_per_access = elements_per_access | 
					
						
							|  |  |  |     self.element_size_bits = element_size_bits | 
					
						
							|  |  |  |     self.shape = shape | 
					
						
							|  |  |  |     self.iterations = iterations | 
					
						
							|  |  |  |     self.delta = delta | 
					
						
							|  |  |  |     self.count = count | 
					
						
							|  |  |  |     pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class EpilogueTileIteratorParams: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def __init__(self, desc, label): | 
					
						
							|  |  |  |     self.desc = desc | 
					
						
							|  |  |  |     self.label = label | 
					
						
							|  |  |  |     self.fmt = "qqqqqqqq" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def size(self): | 
					
						
							|  |  |  |     return struct.calcsize(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def alignment(self): | 
					
						
							|  |  |  |     return MaxAlignment(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def initialize(self, host_workspace, offset, stride): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     stride = stride * self.desc.element_size_bits // 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = AlignedOffset(offset, self.alignment()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     increment_row = stride * self.desc.delta.row | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     increment_group = stride * self.desc.delta.group \ | 
					
						
							|  |  |  |       - stride * self.desc.delta.row * (self.desc.iterations.row - 1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     increment_cluster = stride * self.desc.delta.cluster \ | 
					
						
							|  |  |  |       - stride * self.desc.delta.group * (self.desc.iterations.group - 1) \ | 
					
						
							|  |  |  |       - stride * self.desc.delta.row * (self.desc.iterations.row - 1) | 
					
						
							|  |  |  |        | 
					
						
							|  |  |  |     advance_row = stride * self.desc.shape.row | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     advance_group = stride *                   \ | 
					
						
							|  |  |  |       (self.desc.shape.group - 1) * \ | 
					
						
							|  |  |  |       self.desc.shape.row *         \ | 
					
						
							|  |  |  |       self.desc.count.row | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     advance_cluster = stride * \ | 
					
						
							|  |  |  |       self.desc.count.group * \ | 
					
						
							|  |  |  |       self.desc.shape.group * \ | 
					
						
							|  |  |  |       self.desc.count.row   * \ | 
					
						
							|  |  |  |       self.desc.shape.row | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     advance_tile = stride * \ | 
					
						
							|  |  |  |       self.desc.shape.group * \ | 
					
						
							|  |  |  |       self.desc.shape.row   * \ | 
					
						
							|  |  |  |       self.desc.shape.cluster * \ | 
					
						
							|  |  |  |       self.desc.shape.tile | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     struct.pack_into( | 
					
						
							|  |  |  |       self.fmt,                                           \ | 
					
						
							|  |  |  |       host_workspace,                                     \ | 
					
						
							|  |  |  |       offset,                                             \ | 
					
						
							|  |  |  |       stride,                                             \ | 
					
						
							|  |  |  |       increment_row, increment_group, increment_cluster,  \ | 
					
						
							|  |  |  |       advance_row, advance_group, advance_cluster, advance_tile) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return offset + self.size() | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Launch configuration | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class LaunchConfiguration: | 
					
						
							|  |  |  |   def __init__(self, grid = [1,1,1], block = [1,1,1], smem = 0): | 
					
						
							|  |  |  |     self.grid = grid | 
					
						
							|  |  |  |     self.block = block | 
					
						
							|  |  |  |     self.shared_memory_capacity = smem | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Functors | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class Functor: | 
					
						
							|  |  |  |   def __init__(self): | 
					
						
							|  |  |  |     self.decl = '' | 
					
						
							|  |  |  |     self.definition = '' | 
					
						
							|  |  |  |     self.fmt = '' | 
					
						
							|  |  |  |     self.identifier = '' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def emit_declaration(self): | 
					
						
							|  |  |  |     return self.decl | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def emit_definition(self): | 
					
						
							|  |  |  |     return self.definition | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   #  | 
					
						
							|  |  |  |   def size(self): | 
					
						
							|  |  |  |     '''
 | 
					
						
							|  |  |  |     Size of the packed Params structure | 
					
						
							|  |  |  |     '''
 | 
					
						
							|  |  |  |     return struct.calcsize(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def alignment(self): | 
					
						
							|  |  |  |     return MaxAlignment(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   #  | 
					
						
							|  |  |  |   def initialize(self, host_workspace, offset, arguments): | 
					
						
							|  |  |  |     return offset + self.size() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class LinearCombinationFunctorArguments: | 
					
						
							|  |  |  |   def __init__(self, alpha = 1.0, beta = 0.0): | 
					
						
							|  |  |  |     self.alpha = alpha | 
					
						
							|  |  |  |     self.beta = beta | 
					
						
							|  |  |  |     self.alpha_ptr = 0 | 
					
						
							|  |  |  |     self.beta_ptr = 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class LinearCombinationFunctor(Functor): | 
					
						
							|  |  |  |   def __init__(self): | 
					
						
							|  |  |  |     super().__init__() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.decl = """
 | 
					
						
							|  |  |  |     cutlass::epilogue::thread::LinearCombination< | 
					
						
							|  |  |  |       float, | 
					
						
							|  |  |  |       1, | 
					
						
							|  |  |  |       float, | 
					
						
							|  |  |  |       float | 
					
						
							|  |  |  |     >"""
 | 
					
						
							|  |  |  |     self.identifier = 'linear_combination' | 
					
						
							|  |  |  |     self.fmt = "ffPP" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   #  | 
					
						
							|  |  |  |   def size(self): | 
					
						
							|  |  |  |     '''
 | 
					
						
							|  |  |  |     Size of the packed Params structure | 
					
						
							|  |  |  |     '''
 | 
					
						
							|  |  |  |     return struct.calcsize(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def alignment(self): | 
					
						
							|  |  |  |     return MaxAlignment(self.fmt) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   #  | 
					
						
							|  |  |  |   def initialize(self, host_workspace, offset, arguments): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = AlignedOffset(offset, self.alignment()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     struct.pack_into( | 
					
						
							|  |  |  |       self.fmt,  | 
					
						
							|  |  |  |       host_workspace, offset,  | 
					
						
							|  |  |  |       arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return offset + self.size() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Base class for an executable operation | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class ExecutableOperation: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   def __init__(self, operation): | 
					
						
							|  |  |  |     self.operation = operation | 
					
						
							|  |  |  |     self.module = None | 
					
						
							|  |  |  |     self.kernel = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def name(self): | 
					
						
							|  |  |  |     return self.operation.procedural_name() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def emit(self): | 
					
						
							|  |  |  |     return '' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def can_implement(self, configuration, arguments): | 
					
						
							|  |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def get_host_workspace_size(self, arguments): | 
					
						
							|  |  |  |     return 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def get_device_workspace_size(self, arguments): | 
					
						
							|  |  |  |     return 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def plan(self, arguments): | 
					
						
							|  |  |  |     return LaunchConfiguration() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)): | 
					
						
							|  |  |  |     raise NotImplementedError() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def run(self, host_workspace, device_workspace, launch_config, stream = cuda.CUstream(0)): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace) | 
					
						
							|  |  |  |     packed = (ctypes.c_void_p * 1)() | 
					
						
							|  |  |  |     packed[0] = ctypes.addressof(cArg) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     err, = cuda.cuLaunchKernel( | 
					
						
							|  |  |  |       self.kernel,  | 
					
						
							|  |  |  |       launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],  | 
					
						
							|  |  |  |       launch_config.block[0], launch_config.block[1], launch_config.block[2],  | 
					
						
							|  |  |  |       launch_config.shared_memory_capacity,  | 
					
						
							|  |  |  |       stream,  | 
					
						
							|  |  |  |       packed,  | 
					
						
							|  |  |  |       0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return err | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class GemmArguments: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   def __init__(self): | 
					
						
							|  |  |  |     self.problem_size = GemmCoord(0, 0, 0) | 
					
						
							|  |  |  |     self.A = TensorRef() | 
					
						
							|  |  |  |     self.B = TensorRef() | 
					
						
							|  |  |  |     self.C = TensorRef() | 
					
						
							|  |  |  |     self.D = TensorRef() | 
					
						
							|  |  |  |     self.output_op = LinearCombinationFunctorArguments() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class ThreadblockSwizzle: | 
					
						
							|  |  |  |   def __init__(self, threadblock_shape, log_threadblock_cohort = 0): | 
					
						
							|  |  |  |     self.threadblock_shape = threadblock_shape | 
					
						
							|  |  |  |     self.log_threadblock_cohort = log_threadblock_cohort | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   def grid_tiled_shape(self, problem_size): | 
					
						
							|  |  |  |     return GemmCoord( | 
					
						
							|  |  |  |       ceil_div(problem_size.m, self.threadblock_shape.m),  | 
					
						
							|  |  |  |       ceil_div(problem_size.n, self.threadblock_shape.n),  | 
					
						
							|  |  |  |       1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class Gemm(ExecutableOperation): | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   GEMM manages the CUTLASS runtime components | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def __init__(self, operation): | 
					
						
							|  |  |  |     super().__init__(operation) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.emitter = EmitGemmUniversalInstance('_type') | 
					
						
							|  |  |  |     self.threadblock_swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.threads = 256 | 
					
						
							|  |  |  |     self.shared_memory_capacity = (32 << 10) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.params_A = PredicatedTileAccessIteratorParams( | 
					
						
							|  |  |  |         PredicatedTileAccessIteratorDesc( | 
					
						
							|  |  |  |           32,  | 
					
						
							|  |  |  |           1,  | 
					
						
							|  |  |  |           PitchLinearCoord(128, 8),  | 
					
						
							|  |  |  |           PitchLinearCoord(1, 4),  | 
					
						
							|  |  |  |           PitchLinearCoord(1, 2)), 'A') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.params_B = PredicatedTileAccessIteratorParams( | 
					
						
							|  |  |  |         PredicatedTileAccessIteratorDesc( | 
					
						
							|  |  |  |           32,  | 
					
						
							|  |  |  |           1,  | 
					
						
							|  |  |  |           PitchLinearCoord(128, 8),  | 
					
						
							|  |  |  |           PitchLinearCoord(1, 4),  | 
					
						
							|  |  |  |           PitchLinearCoord(1, 2)), 'B') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.params_C = EpilogueTileIteratorParams( | 
					
						
							|  |  |  |       EpilogueThreadMap( | 
					
						
							|  |  |  |         256,  | 
					
						
							|  |  |  |         1,  | 
					
						
							|  |  |  |         32, | 
					
						
							|  |  |  |         EpilogueTileDesc(128, 1, 4, 4, 1),  | 
					
						
							|  |  |  |         EpilogueTileDesc(4, 1, 2, 1, 1),  | 
					
						
							|  |  |  |         EpilogueTileDesc(32, 1, 8, 1, 1),  | 
					
						
							|  |  |  |         EpilogueTileDesc(1, 4, 2, 1, 8)), 'C') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.params_D = EpilogueTileIteratorParams( | 
					
						
							|  |  |  |       EpilogueThreadMap( | 
					
						
							|  |  |  |         256,  | 
					
						
							|  |  |  |         1, | 
					
						
							|  |  |  |         32, | 
					
						
							|  |  |  |         EpilogueTileDesc(128, 1, 4, 4, 1),  | 
					
						
							|  |  |  |         EpilogueTileDesc(4, 1, 2, 1, 1),  | 
					
						
							|  |  |  |         EpilogueTileDesc(32, 1, 8, 1, 1),  | 
					
						
							|  |  |  |         EpilogueTileDesc(1, 4, 2, 1, 8)), 'D') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     self.output_op = LinearCombinationFunctor() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def emit(self): | 
					
						
							|  |  |  |     return self.emitter.emit(self.operation) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def can_implement(self, configuration, arguments): | 
					
						
							|  |  |  |     pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def get_host_workspace_size(self, arguments): | 
					
						
							|  |  |  |     return 336 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def get_device_workspace_size(self, arguments): | 
					
						
							|  |  |  |     return 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def plan(self, arguments): | 
					
						
							|  |  |  |     grid = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size) | 
					
						
							|  |  |  |     return LaunchConfiguration([grid.m, grid.n, grid.k], [self.threads, 1, 1], self.shared_memory_capacity) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)): | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     offset = 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Compute intermediate results | 
					
						
							|  |  |  |     swizzle_log_tile = 0 | 
					
						
							|  |  |  |     gemm_mode = 0 | 
					
						
							|  |  |  |     batch_count = 1 | 
					
						
							|  |  |  |     gemm_k_size = arguments.problem_size.k | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Pack into the host workspace buffer | 
					
						
							|  |  |  |     offset = arguments.problem_size.pack_into(host_workspace, offset) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     grid_tiled_shape = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size) | 
					
						
							|  |  |  |     offset = grid_tiled_shape.pack_into(host_workspace, offset) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = PackInteger(host_workspace, offset, swizzle_log_tile) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = self.params_A.initialize(host_workspace, offset, arguments.A.layout) | 
					
						
							|  |  |  |     offset = self.params_B.initialize(host_workspace, offset, arguments.B.layout) | 
					
						
							|  |  |  |     offset = self.params_C.initialize(host_workspace, offset, arguments.C.layout) | 
					
						
							|  |  |  |     offset = self.params_D.initialize(host_workspace, offset, arguments.D.layout) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = self.output_op.initialize(host_workspace, offset, arguments.output_op) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     offset = PackInteger(host_workspace, offset, gemm_mode) | 
					
						
							|  |  |  |     offset = PackInteger(host_workspace, offset, batch_count) | 
					
						
							|  |  |  |     offset = PackInteger(host_workspace, offset, gemm_k_size) | 
					
						
							|  |  |  |     offset = PackDevicePointer(host_workspace, offset, int(arguments.A.pointer)) | 
					
						
							|  |  |  |     offset = PackDevicePointer(host_workspace, offset, int(arguments.B.pointer)) | 
					
						
							|  |  |  |     offset = PackDevicePointer(host_workspace, offset, int(arguments.C.pointer)) | 
					
						
							|  |  |  |     offset = PackDevicePointer(host_workspace, offset, int(arguments.D.pointer))    | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return offset | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Module represents a compilation unit  | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class CompilationOptions: | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  |   Compilation options. | 
					
						
							|  |  |  |   '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def __init__(self, architectures = [80], include_paths = []): | 
					
						
							|  |  |  |     self.includes = [] | 
					
						
							|  |  |  |     self.include_paths = include_paths | 
					
						
							|  |  |  |     self.flags = ['-std=c++11', '-default-device'] | 
					
						
							|  |  |  |     self.architectures = architectures | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def get(self): | 
					
						
							|  |  |  |     options = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for flag in self.flags: | 
					
						
							|  |  |  |       options.append(bytes(str.encode(flag))) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for incl in self.include_paths: | 
					
						
							|  |  |  |       options.append(bytes(str.encode('--include-path=%s' % incl))) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     arch_list = "-arch=" | 
					
						
							|  |  |  |     for idx, arch in enumerate(self.architectures): | 
					
						
							|  |  |  |       if idx: | 
					
						
							|  |  |  |         arch_list += "," | 
					
						
							|  |  |  |       arch_list += "sm_%d" % arch | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     options.append(bytes(str.encode(arch_list))) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return options | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | IncludeTemplate = r'''#include "${include}"
 | 
					
						
							|  |  |  | '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | KernelTemplate = r'''
 | 
					
						
							|  |  |  | extern "C" | 
					
						
							|  |  |  | __global__ void | 
					
						
							|  |  |  | ${operation_name}(${operation_name}${operation_suffix}::Params params) { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   // Dynamic shared memory base pointer | 
					
						
							|  |  |  |   extern __shared__ int SharedStorageBase[]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   // Declare pointer to dynamic shared memory. | 
					
						
							|  |  |  |   ${operation_name}${operation_suffix}::SharedStorage *shared_storage = | 
					
						
							|  |  |  |       reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   ${operation_name}${operation_suffix} op; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   op(params, *shared_storage); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class Module: | 
					
						
							|  |  |  |   def __init__(self, name, operations, compilation_options): | 
					
						
							|  |  |  |     self.name = name | 
					
						
							|  |  |  |     self.operations = operations | 
					
						
							|  |  |  |     self.module = None | 
					
						
							|  |  |  |     self.log = None | 
					
						
							|  |  |  |     self.cubin_image = None | 
					
						
							|  |  |  |     self.source_buffer = '' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Emit source | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     self.emit_() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Compile | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     self.compile_(compilation_options) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Load module | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     self.load_() | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Done | 
					
						
							|  |  |  |     return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # Emit a source buffer | 
					
						
							|  |  |  |   def emit_(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # 1. Includes | 
					
						
							|  |  |  |     includes = [] | 
					
						
							|  |  |  |     for operation in self.operations: | 
					
						
							|  |  |  |       for incl in operation.emitter.includes: | 
					
						
							|  |  |  |         if incl not in includes: | 
					
						
							|  |  |  |           includes.append(incl) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for incl in includes: | 
					
						
							|  |  |  |       self.source_buffer += SubstituteTemplate(IncludeTemplate, { 'include': incl} ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # 2. Operations | 
					
						
							|  |  |  |     for operation in self.operations: | 
					
						
							|  |  |  |       self.source_buffer += operation.emit() | 
					
						
							|  |  |  |       values = { | 
					
						
							|  |  |  |         'operation_name': operation.name(), | 
					
						
							|  |  |  |         'operation_suffix': operation.emitter.operation_suffix | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       self.source_buffer += SubstituteTemplate(KernelTemplate, values) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Done | 
					
						
							|  |  |  |     return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # Compile with NVRTC | 
					
						
							|  |  |  |   def compile_(self, compilation_options): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     err, program = nvrtc.nvrtcCreateProgram( | 
					
						
							|  |  |  |       str.encode(self.source_buffer),  | 
					
						
							|  |  |  |       bytes(str.encode(self.name)),  | 
					
						
							|  |  |  |       0, [], []) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: | 
					
						
							|  |  |  |         raise RuntimeError('NVRTC Error: {}'.format(err)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Compile program | 
					
						
							|  |  |  |     options = compilation_options.get() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     err, = nvrtc.nvrtcCompileProgram(program, len(options), options) | 
					
						
							|  |  |  |     if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       error_string = 'NVRTC Error: {}\n'.format(err) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       # Get log from compilation | 
					
						
							|  |  |  |       err, logSize = nvrtc.nvrtcGetProgramLogSize(program) | 
					
						
							|  |  |  |       if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: | 
					
						
							|  |  |  |           raise RuntimeError('NVRTC Error: {}'.format(err)) | 
					
						
							|  |  |  |        | 
					
						
							|  |  |  |       self.log = b' ' * logSize | 
					
						
							|  |  |  |       err, = nvrtc.nvrtcGetProgramLog(program, self.log) | 
					
						
							|  |  |  |       if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: | 
					
						
							|  |  |  |           raise RuntimeError('NVRTC Error: {}'.format(err)) | 
					
						
							|  |  |  |        | 
					
						
							|  |  |  |       raise RuntimeError(error_string + self.log.decode() + self.source_buffer) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Get data from compilation | 
					
						
							|  |  |  |     err, dataSize = nvrtc.nvrtcGetCUBINSize(program) | 
					
						
							|  |  |  |     if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: | 
					
						
							|  |  |  |         raise RuntimeError('NVRTC Error: {}'.format(err)) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     self.cubin_image = b' ' * dataSize | 
					
						
							|  |  |  |     err, = nvrtc.nvrtcGetCUBIN(program, self.cubin_image) | 
					
						
							|  |  |  |     if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: | 
					
						
							|  |  |  |         raise RuntimeError('NVRTC Error: {}'.format(err)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def load_(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Load data as module data | 
					
						
							|  |  |  |     err, self.module = cuda.cuModuleLoadData(self.cubin_image) | 
					
						
							|  |  |  |     if err != cuda.CUresult.CUDA_SUCCESS: | 
					
						
							|  |  |  |         raise RuntimeError('Cuda Error: {}'.format(err)) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Get functions | 
					
						
							|  |  |  |     for operation in self.operations: | 
					
						
							|  |  |  |       err, operation.kernel = cuda.cuModuleGetFunction( | 
					
						
							|  |  |  |         self.module,  | 
					
						
							|  |  |  |         bytes(str.encode(operation.name()))) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       if err != cuda.CUresult.CUDA_SUCCESS: | 
					
						
							|  |  |  |           raise RuntimeError('Cuda Error: {}'.format(err)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       operation.module = self | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Manifest represents an 'owner' for modules and operations | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ################################################################################################# | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | class Manifest: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def __init__(self): | 
					
						
							|  |  |  |     self.operations = {} | 
					
						
							|  |  |  |     self.modules = [] | 
					
						
							|  |  |  |     pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   # | 
					
						
							|  |  |  |   def append_module(self, module): | 
					
						
							|  |  |  |     '''
 | 
					
						
							|  |  |  |     Appends a module and takes ownership of operations used to construct it. | 
					
						
							|  |  |  |     '''
 | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     self.modules.append(module) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for operation in module.operations: | 
					
						
							|  |  |  |       self.operations[operation.name()] = operation | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ################################################################################################# |