################################################################################################# # # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # 3. Neither the name of the copyright holder nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
#################################################################################################

"""
Utilities for enumerating CUTLASS library SM90 kernels
"""

import argparse
import enum
from itertools import product
import math
import logging
import os.path
import shutil
import sys
import copy
from typing import Any, Optional, Sequence, Tuple

# Prefer the installed `cutlass_library` package; fall back to the sibling
# `library` module when the package is missing or has been explicitly disabled
# by setting `builtins.CUTLASS_IGNORE_PACKAGE` to a value equal to True.
try:
    import builtins
    if getattr(builtins, "CUTLASS_IGNORE_PACKAGE", False) == True:
        raise ImportError("Disabling attempt to import cutlass_library")
    from cutlass_library.library import *
except ImportError:
    from library import *


# NOTE: this is a duplicate of CudaToolkitVersionSatisfies in generator.py
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
    """Return True when the given CUDA Toolkit version is at least major.minor.patch.

    `semantic_ver_string` is a dot-separated string of integers (for example
    "12.1"). An empty string selects the built-in default of 11.0.132.
    Components missing from the string keep the default's value at that
    position; components beyond the third are appended. The comparison is a
    lexicographic list comparison against `[major, minor, patch]`.

    Raises ValueError (from `int`) when a component is not an integer literal.
    """
    # Baseline used when no version string is supplied, and for any of the
    # first three components the string leaves out.
    cuda_version = [11, 0, 132]

    if semantic_ver_string != '':
        parsed = [int(component) for component in semantic_ver_string.split('.')]
        # Overwrite the leading components in place. When the string has more
        # than three components the slice covers the whole list, so the extra
        # components end up appended.
        cuda_version[:len(parsed)] = parsed

    return cuda_version >= [major, minor, patch]


#### Step 0: define levels

# One integer level controls multiple "generators" and how many
# combinations they generate. That is the "global" level.
# "Generators" are WGMMA shapes, MMA multipliers, cluster sizes, and
# anything that is eventually involved in the Cartesian product
# which yields our kernel configurations.
# For simplicity, each generator defines their own levels,
# starting from 0. As a rule we assume 10 or fewer levels, making
# their level a digit.
# The "global" level simply stacks these digits and represents them
# as a single integer.
#
# For example, level 500 indicates cluster sizes are at level 5, MMA
# multipliers are at level 0, and WGMMA shapes are at level 0 as well.
#
# Here we define the global level to generator level mappings.
def get_wgmma_level_from_global_level(global_level: int):
    """Return the WGMMA-shape level: the ones digit of the global level."""
    return global_level % 10


def get_mma_level_from_global_level(global_level: int):
    """Return the MMA-multiplier level: the tens digit of the global level."""
    return (global_level // 10) % 10


def get_cluster_level_from_global_level(global_level: int):
    """Return the cluster-size level: the hundreds digit of the global level."""
    return (global_level // 100) % 10


def get_pruning_level_from_global_level(global_level: int):
    """Return the pruning level: the thousands digit of the global level."""
    return (global_level // 1000) % 10


#### Step 1: generate MMA instruction shapes based on levels

# Only an ImportError should trigger the fallback to the non-package import;
# any other failure inside sm90_shapes must surface instead of being masked.
try:
    from .sm90_shapes import (
        SM90_MMA_MULTIPLIERS,
        SM90_CLUSTER_SIZES,
        SM90_WGMMA_SHAPES_TF32_DENSE,
        SM90_WGMMA_SHAPES_FP16_BF16_DENSE,
        SM90_WGMMA_SHAPES_FP8_DENSE,
        SM90_WGMMA_SHAPES_INT8_DENSE,
    )
except ImportError:
    from sm90_shapes import (
        SM90_MMA_MULTIPLIERS,
        SM90_CLUSTER_SIZES,
        SM90_WGMMA_SHAPES_TF32_DENSE,
        SM90_WGMMA_SHAPES_FP16_BF16_DENSE,
        SM90_WGMMA_SHAPES_FP8_DENSE,
        SM90_WGMMA_SHAPES_INT8_DENSE,
    )


def _select_by_min_level(level_table, level: int):
    """Return the keys of `level_table` whose mapped minimum level is <= `level`.

    `level_table` maps a candidate to the lowest generator level at which it
    is enabled. The table's iteration order is preserved in the result.
    """
    return [candidate for candidate, min_level in level_table.items() if level >= min_level]


def generate_tf32_math_instruction_shapes_sm90(level: int):
    """Return the TF32 dense WGMMA shapes enabled at WGMMA level `level`."""
    assert isinstance(level, int) and level >= 0
    return _select_by_min_level(SM90_WGMMA_SHAPES_TF32_DENSE, level)


def generate_fp16_bf16_math_instruction_shapes_sm90(level: int):
    """Return the FP16/BF16 dense WGMMA shapes enabled at WGMMA level `level`."""
    assert isinstance(level, int) and level >= 0
    return _select_by_min_level(SM90_WGMMA_SHAPES_FP16_BF16_DENSE, level)


def generate_fp8_math_instruction_shapes_sm90(level: int):
    """Return the FP8 dense WGMMA shapes enabled at WGMMA level `level`."""
    assert isinstance(level, int) and level >= 0
    return _select_by_min_level(SM90_WGMMA_SHAPES_FP8_DENSE, level)


def generate_int8_math_instruction_shapes_sm90(level: int):
    """Return the INT8 dense WGMMA shapes enabled at WGMMA level `level`."""
    assert isinstance(level, int) and level >= 0
    return _select_by_min_level(SM90_WGMMA_SHAPES_INT8_DENSE, level)


###########

def generate_tf32_math_instructions_sm90(level: int):
    """Return TF32 x TF32 -> F32 tensor-op math instructions for global level `level`.

    One instruction is produced per WGMMA shape enabled at the WGMMA digit of
    the global level.
    """
    wgmma_level = get_wgmma_level_from_global_level(level)
    math_instructions = []
    for math_instruction_shape in generate_tf32_math_instruction_shapes_sm90(wgmma_level):
        math_instructions.append(
            MathInstruction(
                math_instruction_shape,
                DataType.tf32, DataType.tf32, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add)
        )
    return math_instructions


def generate_fp16_bf16_math_instructions_sm90(level: int):
    """Return FP16/BF16 tensor-op math instructions for global level `level`.

    For each enabled WGMMA shape, three instructions are produced, in order:
    f16 x f16 -> f16, f16 x f16 -> f32, and bf16 x bf16 -> f32.
    """
    wgmma_level = get_wgmma_level_from_global_level(level)
    math_instructions = []
    for math_instruction_shape in generate_fp16_bf16_math_instruction_shapes_sm90(wgmma_level):
        math_instructions += [
            MathInstruction(
                math_instruction_shape,
                DataType.f16, DataType.f16, DataType.f16,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
            MathInstruction(
                math_instruction_shape,
                DataType.f16, DataType.f16, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
            MathInstruction(
                math_instruction_shape,
                DataType.bf16, DataType.bf16, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
        ]
    return math_instructions


def generate_fp8_math_instructions_sm90(level: int):
    """Return FP8 tensor-op math instructions for global level `level`.

    For each enabled WGMMA shape, all four (A, B) combinations of e4m3 and
    e5m2 are produced, each accumulating in f32.
    """
    wgmma_level = get_wgmma_level_from_global_level(level)
    math_instructions = []
    for math_instruction_shape in generate_fp8_math_instruction_shapes_sm90(wgmma_level):
        math_instructions += [
            MathInstruction(
                math_instruction_shape,
                DataType.e4m3, DataType.e4m3, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
            MathInstruction(
                math_instruction_shape,
                DataType.e4m3, DataType.e5m2, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
            MathInstruction(
                math_instruction_shape,
                DataType.e5m2, DataType.e4m3, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
            MathInstruction(
                math_instruction_shape,
                DataType.e5m2, DataType.e5m2, DataType.f32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
        ]
    return math_instructions


def generate_int8_math_instructions_sm90(level: int):
    """Return INT8 tensor-op math instructions for global level `level`.

    For each enabled WGMMA shape, two instructions are produced, in order:
    s8 x s8 -> s32 and u8 x u8 -> s32.
    """
    wgmma_level = get_wgmma_level_from_global_level(level)
    math_instructions = []
    for math_instruction_shape in generate_int8_math_instruction_shapes_sm90(wgmma_level):
        math_instructions += [
            MathInstruction(
                math_instruction_shape,
                DataType.s8, DataType.s8, DataType.s32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
            MathInstruction(
                math_instruction_shape,
                DataType.u8, DataType.u8, DataType.s32,
                OpcodeClass.TensorOp,
                MathOperation.multiply_add),
        ]
    return math_instructions


def make_sparse_math_instructions(math_instructions):
    """Return SparseTensorOp counterparts of the TensorOp instructions given.

    Each TensorOp instruction is copied with its K extent (third shape
    component) doubled and its opcode class set to SparseTensorOp.
    Instructions of any other opcode class are skipped.
    """
    return [
        MathInstruction(
            (inst.instruction_shape[0], inst.instruction_shape[1], inst.instruction_shape[2] * 2),
            inst.element_a, inst.element_b, inst.element_accumulator,
            OpcodeClass.SparseTensorOp,
            inst.math_operation)
        for inst in math_instructions
        if inst.opcode_class == OpcodeClass.TensorOp
    ]


#### Step 2: generate tile descriptions from math instruction shapes

def is_tile_desc_valid(tile_description):
    """Return True when `tile_description` passes the SM90 tile-shape checks.

    The checks cover compute capability (must be exactly 90 for both bounds),
    cluster size, and the CTA shape along M, N and K relative to the
    instruction shape.
    """
    if tile_description.minimum_compute_capability != 90 or tile_description.maximum_compute_capability != 90:
        return False

    cluster_shape, cta_shape, inst_shape = (
        tile_description.cluster_shape,
        tile_description.threadblock_shape,
        tile_description.math_instruction.instruction_shape
    )
    # NOTE(review): this is a sum of per-dimension products, not a product of
    # extents; it is only used for the `< 1` sanity check below. Confirm that
    # a sum is what is intended.
    grid_size = (
        cta_shape[0] * cluster_shape[0] +
        cta_shape[1] * cluster_shape[1] +
        cta_shape[2] * cluster_shape[2]
    )
    cluster_size = cluster_shape[0] * cluster_shape[1] * cluster_shape[2]

    # Maximum number of CTAs per cluster is 8 for Hopper, but up to 16 is
    # allowed for non portable clusters.
    if cluster_size > 16 or cluster_size < 1:
        return False

    if grid_size < 1:
        return False

    # SM90 WGMMA shapes are always 64 across M, therefore
    # CTA shape across M must always be a multiple of 64.
    if cta_shape[0] < 64 or cta_shape[0] % 64 != 0:
        return False

    # The minimum WGMMA shape across N is 8, and increments
    # vary across different dtypes, but they're never smaller
    # than 8. The minimum CTA shape allowed across N though is 16.
    if cta_shape[1] < 16 or cta_shape[1] % 8 != 0:
        return False

    # SM90 WGMMA shapes across K are always 8 for 32 bit dense
    # operations, 16 for 16 bit, and 32 for 8 bit. In any case,
    # the CTA shape across K should be a multiple of 8 and at least
    # twice the WGMMA shape across K.
    if cta_shape[2] < 16 or cta_shape[2] % 8 != 0:
        return False

    # Minimum of 2 stages
    if cta_shape[2] < inst_shape[2] or cta_shape[2] % inst_shape[2] != 0 or cta_shape[2] / inst_shape[2] < 2:
        return False

    # CTA shape upper bound: <256, 256, 256>
    if cta_shape[0] > 256 or cta_shape[1] > 256 or cta_shape[2] > 256:
        return False

    return True


def get_mma_multipliers(level: int):
    """Return the MMA multipliers enabled at the MMA digit of global level `level`."""
    assert isinstance(level, int) and level >= 0
    mma_level = get_mma_level_from_global_level(level)
    return _select_by_min_level(SM90_MMA_MULTIPLIERS, mma_level)


def get_cluster_sizes(level: int, is_aligned: bool):
    """Return the cluster sizes enabled at the cluster digit of global level `level`.

    When `is_aligned` is false, only the trivial (1, 1, 1) cluster is
    returned and `level` is not inspected.
    """
    if not is_aligned:
        return [(1, 1, 1)]
    assert isinstance(level, int) and level >= 0
    cluster_level = get_cluster_level_from_global_level(level)
    return _select_by_min_level(SM90_CLUSTER_SIZES, cluster_level)


def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level: int):
    """Return the set of valid tile descriptions for the given math instructions.

    Candidates are the Cartesian product of `math_instructions`, the MMA
    multipliers and the cluster sizes enabled at `level`. The CTA shape is the
    instruction shape scaled component-wise by the multiplier. Only candidates
    accepted by `is_tile_desc_valid` are kept.
    """
    tile_descriptions = set()
    mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned)
    for math_inst, mma_mul, cluster_size in product(math_instructions, mma_multipliers, cluster_sizes):
        tile_desc = TileDescription(
            threadblock_shape=[
                math_inst.instruction_shape[0] * mma_mul[0],
                math_inst.instruction_shape[1] * mma_mul[1],
                math_inst.instruction_shape[2] * mma_mul[2]
            ],
            stages=0,
            warp_count=[4, 1, 1],
            math_instruction=math_inst,
            min_compute=90,
            max_compute=90,
            cluster_shape=cluster_size)
        # For sparse kernels K-tile is twice as large (due to 2x MMA-K size)
        # Reduce it to same size as dense to afford more smem stages
        if math_inst.opcode_class == OpcodeClass.SparseTensorOp:
            tile_desc.threadblock_shape[2] = tile_desc.threadblock_shape[2] // 2
        if is_tile_desc_valid(tile_desc):
            tile_descriptions.add(tile_desc)

    return tile_descriptions


#### Step 3: map tile description to valid schedules

def is_tile_desc_compatible_with_cooperative(tile_description):
    """Return True when the CTA shape along M is at least 128."""
    # Cooperative kernels require a minimum CTA-M of 128
    return tile_description.threadblock_shape[0] >= 128


def can_tile_desc_use_shmem_in_epilogue(tile_description, data_types):
    """Return True when C and D tiles together fit the epilogue shmem budget.

    The footprint is CTA-M * CTA-N elements for each of `data_types["c_type"]`
    and `data_types["d_type"]`, measured in bits via `DataTypeSize`, and
    compared against a budget of 2^20 bits.
    """
    mn = tile_description.threadblock_shape[0] * tile_description.threadblock_shape[1]
    bitsize_c, bitsize_d = DataTypeSize[data_types["c_type"]], DataTypeSize[data_types["d_type"]]

    shmem_bits_c, shmem_bits_d = bitsize_c * mn, bitsize_d * mn
    shmem_bits_total = shmem_bits_c + shmem_bits_d
    # Magic number: 2^20 bits.
    # Existing logic suggested that tile shape 256x128 (or 128x256)
    # would run out of shmem if D is FP32, and source is needed.
    # Each FP32 tensor of that tile takes 256 * 128 * 32 == 2^20 bits, so
    # C and D together take 2^21 bits (~262 KB), which is over the limit.
    # Hopper's max shmem size is 228 KB, and 2^20 bits ~= 131 KB.
    # Since epilogue can't possibly use ALL of the shmem available
    # we can just settle on 2^20 bits (~ 131 KB) being the upper bound
    # we would allow for epilogue.
    # This can be different for non-persistent kernels where epilogue and
    # mainloop shmem is shared.
    return shmem_bits_total <= 2 ** 20


def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, layout,
                        instantiation_level, enable_fp8_fast_acc=True):
    """Return the (kernel, epilogue) schedule pairs valid for a tile description.

    Parameters:
      tile_description: tile to schedule; must pass `is_tile_desc_valid`,
        otherwise two empty lists are returned.
      cuda_version: CUDA Toolkit version string, as accepted by
        `CudaToolkitVersionSatisfies`.
      is_aligned: selects TMA-based schedules when true, CpAsync-based
        schedules when false.
      data_types: dict with at least "a_type", "b_type", "c_type" and
        "d_type" keys.
      layout: sequence whose first two entries are [layout_type, alignment]
        pairs for A and B.
      instantiation_level: global level; only its pruning digit is used here.
      enable_fp8_fast_acc: when true, FP8 inputs also get FP8-fast-accum
        kernel schedules.

    Returns:
      A tuple `(schedules, stream_k_schedules)`; each is a list of
      `[KernelScheduleType, EpilogueScheduleType]` pairs.
    """
    # Level 0: prune according to existing generator.py behavior
    # Level >= 1: no pruning
    level = get_pruning_level_from_global_level(instantiation_level)
    schedules = []
    stream_k_schedules = []

    if not is_tile_desc_valid(tile_description):
        return schedules, stream_k_schedules

    FP16_TYPES = [DataType.f16, DataType.bf16]
    is_fp16 = data_types["a_type"] in FP16_TYPES and data_types["b_type"] in FP16_TYPES

    FP8_TYPES = [DataType.e4m3, DataType.e5m2]
    is_fp8 = data_types["a_type"] in FP8_TYPES and data_types["b_type"] in FP8_TYPES
    can_do_fp8_fast_accum = is_fp8 and enable_fp8_fast_acc

    FP32_TYPES = [DataType.f32, DataType.tf32]
    is_fp32 = data_types["a_type"] in FP32_TYPES and data_types["b_type"] in FP32_TYPES
    requires_transposed_epilogue = is_fp32 and layout[0][0] == LayoutType.RowMajor and layout[1][0] == LayoutType.RowMajor

    is_sparse = tile_description.math_instruction.opcode_class == OpcodeClass.SparseTensorOp

    can_do_cooperative = is_tile_desc_compatible_with_cooperative(tile_description)
    can_do_tma_epilogue = is_aligned and not requires_transposed_epilogue and can_tile_desc_use_shmem_in_epilogue(tile_description, data_types)

    default_epilogue = EpilogueScheduleType.NoSmemWarpSpecialized if not requires_transposed_epilogue else EpilogueScheduleType.EpilogueTransposed
    auto_epilogue = EpilogueScheduleType.ScheduleAuto if not requires_transposed_epilogue else EpilogueScheduleType.EpilogueTransposed

    cta_m, cta_n, cta_k = (
        tile_description.threadblock_shape[0],
        tile_description.threadblock_shape[1],
        tile_description.threadblock_shape[2]
    )
    c_type = data_types["c_type"]
    d_type = data_types["d_type"]
    is_void_c = c_type == DataType.void

    # Early pruning
    if level < 1:
        # Don't stamp out FP16/BF16 kernels smaller than or equal to 64x128x64
        if is_fp16 and cta_m <= 64 and cta_n <= 128 and cta_k <= 64:
            return [], []

        # FP8 configs with CTA tile larger than or equal to 256x128x128 limit data types and schedules
        is_large_fp8_tile = is_fp8 and cta_m >= 256 and cta_n >= 128 and cta_k >= 128
        if is_large_fp8_tile:
            # Only void-C, and only FP8 outputs allowed
            if not is_void_c or d_type not in FP8_TYPES:
                return [], []
            if CudaToolkitVersionSatisfies(cuda_version, 12, 1) and can_do_cooperative and can_do_tma_epilogue:
                # NOTE(review): for sparse tiles both entries below are the
                # same FP8-fast-accum pair, and this branch does not consult
                # `enable_fp8_fast_acc`. Confirm both are intended.
                return [
                    [
                        KernelScheduleType.TmaWarpSpecializedCooperative if not is_sparse else KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ],
                    [
                        KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ],
                ], []
            return [], []

        if is_fp8 and not is_large_fp8_tile:
            valid_dtypes_for_c = [DataType.f32, DataType.bf16, DataType.f16]
            # Prune all configs with fp8 source, and all configs with non-fp8 output
            # that have different dtypes for source and output.
            if c_type not in valid_dtypes_for_c or (d_type not in FP8_TYPES and c_type != d_type):
                return [], []

        # FP32/TF32 kernels don't stamp out void-C
        if is_fp32 and is_void_c:
            return [], []

        # Void-c only makes a difference for TMA epilogues
        if is_void_c and not can_do_tma_epilogue:
            return [], []

    if not is_aligned:
        schedules = [[KernelScheduleType.CpAsyncWarpSpecialized,
                      default_epilogue]]
        stream_k_schedules = []

        if CudaToolkitVersionSatisfies(cuda_version, 12, 1) and can_do_cooperative:
            schedules.append([
                KernelScheduleType.CpAsyncWarpSpecializedCooperative,
                default_epilogue
            ])
            stream_k_schedules.append([
                KernelScheduleType.CpAsyncWarpSpecializedCooperative,
                default_epilogue
            ])

        return schedules, stream_k_schedules

    schedules = []
    # Pruning: emit Void-C kernels with persistent kernels only
    if level >= 1 or not is_void_c:
        # Pruning: don't stamp out fp8 kernels with auto schedule
        if not is_fp8:
            schedules.append([KernelScheduleType.ScheduleAuto, auto_epilogue])
        if not (is_fp8 and is_sparse):
            schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
    stream_k_schedules = []

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        # Pruning: don't stamp out fp8 ping-ponging kernel with non-tma epilogue
        if not is_fp8 or level >= 1:
            schedules.append([KernelScheduleType.TmaWarpSpecializedPingpong, default_epilogue])

        if can_do_fp8_fast_accum:
            schedules.append([KernelScheduleType.TmaWarpSpecializedFP8FastAccum, default_epilogue])
            schedules.append([KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, default_epilogue])

        if can_do_cooperative:
            # Sparse kernels only support FastAccum FP8 mainloop
            if not (is_fp8 and is_sparse):
                schedules.append([
                    KernelScheduleType.TmaWarpSpecializedCooperative,
                    default_epilogue
                ])
                stream_k_schedules.append([
                    KernelScheduleType.TmaWarpSpecializedCooperative,
                    default_epilogue
                ])
            if can_do_fp8_fast_accum:
                schedules.append([
                    KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
                    default_epilogue
                ])
                stream_k_schedules.append([
                    KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
                    default_epilogue
                ])

        # persistent kernels with TMA epilogues
        if can_do_tma_epilogue:
            assert not requires_transposed_epilogue
            # Inconsistency: fp8 pingpong only gets stamped out with fast accum
            if not is_fp8 or level >= 1:
                schedules.append([
                    KernelScheduleType.TmaWarpSpecializedPingpong,
                    EpilogueScheduleType.TmaWarpSpecialized
                ])
            if can_do_fp8_fast_accum:
                schedules.append([
                    KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum,
                    EpilogueScheduleType.TmaWarpSpecialized
                ])
            if can_do_cooperative:
                # Sparse kernels only support FastAccum FP8 mainloop
                if not (is_fp8 and is_sparse):
                    schedules.append([
                        KernelScheduleType.TmaWarpSpecializedCooperative,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ])
                    stream_k_schedules.append([
                        KernelScheduleType.TmaWarpSpecializedCooperative,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ])
                if can_do_fp8_fast_accum:
                    schedules.append([
                        KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ])
                    stream_k_schedules.append([
                        KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ])

    return schedules, stream_k_schedules


#### Misc: helpers

def generate_data_types_from_math_instruction(math_instruction, element_source = None, element_dest = None, element_epilogue = None):
    """Build the data-type dict for a math instruction.

    A and B types and the accumulator type come from `math_instruction`.
    `element_source`, `element_dest` and `element_epilogue` each fall back to
    the accumulator type when falsy (for example, left as None).

    Returns a dict with keys "a_type", "b_type", "c_type", "d_type",
    "acc_type" and "epi_type".
    """
    element_a, element_b = math_instruction.element_a, math_instruction.element_b
    element_accumulator = math_instruction.element_accumulator
    element_c = element_source or element_accumulator
    element_d = element_dest or element_accumulator
    element_epilogue = element_epilogue or element_accumulator
    data_types = {
        "a_type"   : element_a,
        "b_type"   : element_b,
        "c_type"   : element_c,
        "d_type"   : element_d,
        "acc_type" : element_accumulator,
        "epi_type" : element_epilogue
    }
    return data_types


def fix_alignments(data_types, layout, alignment_bits = 128):
    """Return a copy of `layout` with the C operand's alignment recomputed.

    `layout` holds one [layout_type, alignment] pair for each of A, B and C,
    in that order. A and B pairs are copied as they are. The C alignment is
    replaced by `alignment_bits // DataTypeSize[c_type]`, unless that size is
    below 1 bit, in which case the existing alignment is kept.

    Returns a new list of [layout_type, alignment] lists; `layout` itself is
    not modified.
    """
    operand_keys = ["a_type", "b_type", "c_type"]
    operands_to_fix = ["c_type"]
    new_layout = []
    assert len(layout) == len(operand_keys)
    for i, k in enumerate(operand_keys):
        assert k in data_types and data_types[k] in DataTypeSize
        dtype = data_types[k]
        dtype_size_bits = DataTypeSize[dtype]

        layout_type = layout[i][0]
        layout_alignment = layout[i][1]

        # Don't modify alignment if dtype's been changed to void
        if k in operands_to_fix and dtype_size_bits >= 1:
            layout_alignment = alignment_bits // dtype_size_bits

        new_layout.append([layout_type, layout_alignment])

    return new_layout