cutlass/python/cutlass_library/generator.py
2024-01-16 14:37:22 -05:00

6108 lines
244 KiB
Python

#################################################################################################
#
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utilities for enumerating CUTLASS library kernels
"""
import argparse
import enum
from itertools import product
import logging
import os.path
import shutil
import sys
# Certain usecases of cutlass_library nearly always prefer to run as scripts with
# relative imports, rather than via an installed Python package. An example of this
# is using CUTLASS's CMake system to generate a library of kernels to be profiled.
# To make it easy to use these use cases when an existing installation of cutlass_library
# exists, this global flag can be set to true (via command-line arguments) to ensure
# that package-based installations are not used.
# Create a temporary argument parser to check only for the availability of the
# --disable-cutlass-package-imports argument, which controls whether package-based
# imports are disabled.
def _add_package_disablement_flag(argparser):
argparser.add_argument("--disable-cutlass-package-imports", action='store_true', required=False,
help="Disable use of cutlass_library from Python package")
_parser = argparse.ArgumentParser()
_add_package_disablement_flag(_parser)
_args, _ = _parser.parse_known_args()
# Add `CUTLASS_IGNORE_PACKAGE` to `builtins` so that it is visible for gating future
# imports without requiring importing another module. Ideally, we would just place this
# as a global variable in a module to that could be imported and checked (e.g.,
# utils.CUTLASS_IGNORE_PACKAGE). However, this raises the issue of determining
# where this module should be sourced (from the cutlass_library package or from
# a relative import), which is the problem this variable is being used to solve in the
# first place.
import builtins
builtins.CUTLASS_IGNORE_PACKAGE = _args.disable_cutlass_package_imports
try:
if CUTLASS_IGNORE_PACKAGE:
raise ImportError("Disabling attempt to import cutlass_library")
from cutlass_library.library import *
from cutlass_library.manifest import *
except ImportError:
from library import *
from manifest import *
###################################################################################################
#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
# by default, use the latest CUDA Toolkit version
cuda_version = [11, 0, 132]
# Update cuda_version based on parsed string
if semantic_ver_string != '':
for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]):
if i < len(cuda_version):
cuda_version[i] = x
else:
cuda_version.append(x)
return cuda_version >= [major, minor, patch]
###################################################################################################
###################################################################################################
#
def EpilogueAlignment(max_alignment, tile, epilogue_steps = 8):
''' Helper to compute the maximum alignment of the epilogue '''
def product(X, identity = 1):
result = identity
for item in X:
result *= item
return result
elements_per_thread = product(tile.threadblock_shape[:-1]) // product(tile.warp_count) // 32 // epilogue_steps
return min(max_alignment, elements_per_thread)
def DefaultSwizzlingFunctor():
return SwizzlingFunctor.Identity8
# To use StreamK decomposition for basic GEMMs, set `swizzling_functor = SwizzlingFunctor.StreamK`
#
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = DefaultSwizzlingFunctor()):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
# If alignment is a tuple or a list, then we have different alignments for A and B
alignment_a = alignment if isinstance(alignment, int) else alignment[0]
alignment_b = alignment if isinstance(alignment, int) else alignment[1]
alignment_c = min(8, alignment_a) if isinstance(alignment, int) else alignment[2]
A = TensorDescription(element_a, layout[0], alignment_a, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment_b, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
# Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts
def CreateGemmUniversal3xOperator(
manifest, layouts, tile_descriptions, data_types,
schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]],
complex_transforms=None,
epilogue_functor=EpilogueFunctor.LinearCombination,
swizzling_functor=SwizzlingFunctor.Identity1,
tile_schedulers=[TileSchedulerType.Persistent]):
if type(data_types) is dict:
data_types = [data_types]
for s in schedules:
assert(len(s) == 2)
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none), ]
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0]]
combinations = product(layouts, tile_descriptions, data_types, complex_transforms, schedules, tile_schedulers)
for layout, tile_description, data_type, complex_transform, schedules, tile_scheduler in combinations:
kernel_schedule, epilogue_schedule = schedules
A = TensorDescription(
data_type["a_type"], layout[0][0], layout[0][1], complex_transform[0])
B = TensorDescription(
data_type["b_type"], layout[1][0], layout[1][1], complex_transform[1])
C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1])
D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1])
gemm_op_extra_args = {}
gemm_kind = GemmKind.Universal3x
element_compute = data_type.get("epi_type", data_type["acc_type"])
operation = GemmOperation(
gemm_kind, tile_description.minimum_compute_capability,
tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D,
kernel_schedule, epilogue_schedule, tile_scheduler, **gemm_op_extra_args)
manifest.append(operation)
operations.append(operation)
return operations
#
def CreateSparseGemmOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Identity8):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
gemm_kinds = [GemmKind.Sparse]
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = GemmOperation(GemmKind.Sparse, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
#
def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
gemm_kinds = [GemmKind.PlanarComplex, GemmKind.PlanarComplexArray]
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for gemm_kind in gemm_kinds:
for layout in layouts:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
manifest.append(GemmOperation(gemm_kind, \
tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue))
return
#
def CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Identity8):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = GroupedGemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
#
def CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, data_type, \
alignment_constraints, blas_mode, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Identity8):
element_a, element_c, element_epilogue = data_type
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for fill_mode in fill_modes:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
# SERK supported layouts (RowMajor, ColumnMajor) with no conjugation
complex_transform = ComplexTransform.none
# HERK supported layouts (RowMajor + conj, ColumnMajor)
if blas_mode == BlasMode.hermitian and layout[0] == LayoutType.RowMajor:
complex_transform = ComplexTransform.conj
alignment_c = 1 # Alignment only applies to A in SYRK
A = TensorDescription(element_a, layout[0], alignment, complex_transform)
C = SymmetricTensorDescription(element_c, layout[1], fill_mode, alignment_c)
# Rank-K update
new_operation = RankKOperation(RankKKind.Universal, tile_description.minimum_compute_capability, \
tile_description, A, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
manifest.append(new_operation)
operations.append(new_operation)
# Rank-2K update
new_operation = Rank2KOperation(RankKKind.Universal, tile_description.minimum_compute_capability, \
tile_description, A, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
manifest.append(new_operation)
operations.append(new_operation)
return operations
#
def CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, data_type, \
alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Identity8):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for side_mode in side_modes:
for fill_mode in fill_modes:
for diag_type in diag_types:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TriangularTensorDescription(element_a, layout[0], side_mode, fill_mode, diag_type,
alignment, complex_transform)
B = TensorDescription(element_b, layout[1], alignment)
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = TrmmOperation(TrmmKind.Universal, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
#
def CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, data_type, \
alignment_constraints, blas_mode, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Identity8):
element_a, element_b, element_c, element_epilogue = data_type
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for side_mode in side_modes:
for fill_mode in fill_modes:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
# SYMM supported layouts (RowMajor, ColumnMajor) with no conjugation
complex_transform = ComplexTransform.none
alignment_a = 1 # No vectorized access for the triangular matrix
alignment_c = min(8, alignment)
A = SymmetricTensorDescription(element_a, layout[0], fill_mode, alignment_a, complex_transform, side_mode)
# tensor A and B have same data type and layout
B = TensorDescription(element_b, layout[0], alignment)
C = TensorDescription(element_c, layout[1], alignment_c)
# SYMM/HEMM update
new_operation = SymmOperation(SymmKind.Universal, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
manifest.append(new_operation)
operations.append(new_operation)
# SYMM/HEMM update
new_operation = SymmOperation(SymmKind.Universal, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
manifest.append(new_operation)
operations.append(new_operation)
return operations
###########################################################################################################
# ConvolutionOperator support variations
# ____________________________________________________________________
# ConvolutionalOperator | Analytic | Optimized
# ____________________________________________________________________
# | Fprop | (strided) | (strided)
# | Dgrad | (strided, unity*) | (strided, unity)
# | Wgrad | (strided) | (strided)
# ____________________________________________________________________
#
# Note : Operator marked (*) are supported but not generated to keep the instantiated kernel count low
###########################################################################################################
# Convolution for 2D operations
def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment_constraints, \
conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
element_a, element_b, element_c, element_epilogue = data_type
# one exceptional case
# iterator algorithm (analytic and optimized)
iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size, largest alignment, and optimized iterator
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
iterator_algorithms = [IteratorAlgorithm.Optimized]
operations = []
for tile in tile_descriptions:
for alignment in alignment_constraints:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment)
B = TensorDescription(element_b, layout[1], alignment)
C = TensorDescription(element_c, layout[2], alignment_c)
swizzling_functor_ = swizzling_functor
#
# Conv2d Fprop
#
if ConvKind.Fprop in conv_kinds:
# Strided support for Analytic and Optimized Fprop
for iterator_algorithm in iterator_algorithms:
new_operations = [
# None grouped kernel
Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_),
]
# Instance group conv kernel
if tile.math_instruction.opcode_class == OpcodeClass.TensorOp and A.layout == LayoutType.TensorNHWC and \
tile.minimum_compute_capability >= 80:
# SingleGroup kernel
new_operations.append(Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_, group_mode=GroupMode.SingleGroup))
# Analytic iterator supports MultipleGroup mode
if iterator_algorithm == IteratorAlgorithm.Analytic:
new_operations.append(Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_, group_mode=GroupMode.MultipleGroup))
for new_operation in new_operations:
manifest.append(new_operation)
operations.append(new_operation)
#
# Conv2d Dgrad
#
if ConvKind.Dgrad in conv_kinds:
# Unity stride for Analytic and Optimized Dgrad
for iterator_algorithm in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Dgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor, swizzling_functor_)
manifest.append(new_operation)
operations.append(new_operation)
# Strided support for Analytic Dgrad
# strided dgrad uses a special threadblock swizzle
# note that SwizzlingFunctor.StridedDgradHorizontal might be
# better for problem sizes with large activation channel count
swizzling_functor_strided_dgrad_ = SwizzlingFunctor.StridedDgradIdentity1
if IteratorAlgorithm.Analytic in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)
manifest.append(new_operation)
operations.append(new_operation)
# Strided support for Optimized Dgrad
if IteratorAlgorithm.Optimized in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Optimized, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)
manifest.append(new_operation)
operations.append(new_operation)
#
# Conv2d Wgrad
#
if ConvKind.Wgrad in conv_kinds:
# Strided support for Analytic and Optimized Wgrad
for iterator_algorithm in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Wgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)
manifest.append(new_operation)
operations.append(new_operation)
return operations
# Convolution for 2D operations specialized for few channels
def CreateConv2dFixedChannelsOperator(manifest, layout, tile_descriptions, data_type, channel_counts, \
conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
element_a, element_b, element_c, element_epilogue = data_type
# one exceptional case
# iterator algorithm (analytic and optimized)
iterator_algorithms = [IteratorAlgorithm.FixedChannels,]
# by default, only generate the largest tile size, largest alignment, and optimized iterator
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
channel_counts = [channel_counts[0],]
operations = []
for tile in tile_descriptions:
for channel_count in channel_counts:
alignment_c = EpilogueAlignment(channel_count, tile)
A = TensorDescription(element_a, layout[0], channel_count)
B = TensorDescription(element_b, layout[1], channel_count)
C = TensorDescription(element_c, layout[2], alignment_c)
swizzling_functor_ = swizzling_functor
#
# Conv2d Fprop
#
if ConvKind.Fprop in conv_kinds:
# Strided support for Analytic and Optimized Fprop
for iterator_algorithm in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)
manifest.append(new_operation)
operations.append(new_operation)
return operations
# Convolution for 2D operations specialized for few channels
def CreateConv2dFewChannelsOperator(manifest, layout, tile_descriptions, data_type, channel_counts, \
conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
element_a, element_b, element_c, element_epilogue = data_type
# one exceptional case
# iterator algorithm (analytic and optimized)
iterator_algorithms = [IteratorAlgorithm.FewChannels,]
# by default, only generate the largest tile size, largest alignment, and optimized iterator
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
channel_counts = [channel_counts[0],]
operations = []
for tile in tile_descriptions:
for channel_count in channel_counts:
alignment_c = EpilogueAlignment(channel_count, tile)
A = TensorDescription(element_a, layout[0], channel_count)
B = TensorDescription(element_b, layout[1], channel_count)
C = TensorDescription(element_c, layout[2], alignment_c)
swizzling_functor_ = swizzling_functor
#
# Conv2d Fprop
#
if ConvKind.Fprop in conv_kinds:
# Strided support for Analytic and Optimized Fprop
for iterator_algorithm in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)
manifest.append(new_operation)
operations.append(new_operation)
return operations
# Convolution for 3D operations
def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignment, \
conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination):
element_a, element_b, element_c, element_epilogue = data_type
# one exceptional case
alignment_c = min(8, alignment)
# iterator algorithm (analytic and optimized)
iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size and optimized iterators
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
iterator_algorithms = [IteratorAlgorithm.Optimized]
operations = []
# All tile sizes for Conv3dFprop and Conv3dWgrad
for tile in tile_descriptions:
A = TensorDescription(element_a, layout, alignment)
B = TensorDescription(element_b, layout, alignment)
C = TensorDescription(element_c, layout, alignment_c)
#
# Conv3d Fprop
#
if ConvKind.Fprop in conv_kinds:
# Strided support for Analytic and Optimized Fprop
for iterator_algorithm in iterator_algorithms:
new_operation = Conv3dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided)
manifest.append(new_operation)
operations.append(new_operation)
#
# Conv3d Wgrad
#
if ConvKind.Wgrad in conv_kinds:
# Strided support for Analytic and Optimized Wgrad
for iterator_algorithm in iterator_algorithms:
new_operation = Conv3dOperation(ConvKind.Wgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
manifest.append(new_operation)
operations.append(new_operation)
# All tile sizes for Conv3dDgrad
for tile in tile_descriptions:
A = TensorDescription(element_a, layout, alignment)
B = TensorDescription(element_b, layout, alignment)
C = TensorDescription(element_c, layout, alignment_c)
#
# Conv3d Dgrad
#
if ConvKind.Dgrad in conv_kinds:
# Unity stride for Optimized Dgrad
new_operation = Conv3dOperation(ConvKind.Dgrad, IteratorAlgorithm.Optimized, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor)
manifest.append(new_operation)
operations.append(new_operation)
# Strided support for Analytic Dgrad
# Conv3dDgrad has a naive strided support which does not cut down redundant MMAs
new_operation = Conv3dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
# Convolution for Depthwise 2d conv
def CreateDepthwiseConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment_constraints, \
conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
element_a, element_b, element_c, element_epilogue = data_type
# iterator algorithm (FixedStrideDilation, Optimized)
iterator_algorithms = [IteratorAlgorithm.FixedStrideDilation, IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size, largest alignment, and optimized iterator
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
operations = []
for tile in tile_descriptions:
for alignment in alignment_constraints:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment)
B = TensorDescription(element_b, layout[1], alignment)
C = TensorDescription(element_c, layout[2], alignment_c)
swizzling_functor_ = swizzling_functor
if ConvKind.Fprop in conv_kinds:
# Strided support for Optimized and FixedStridedDilation Depthwise Conv
for iterator_algorithm in iterator_algorithms:
stride_support = StrideSupport.Strided
if iterator_algorithm == IteratorAlgorithm.FixedStrideDilation:
if tile.stride == [-1, -1] or tile.dilation == [-1,-1]:
continue
stride_support = StrideSupport.Fixed
if iterator_algorithm == IteratorAlgorithm.Optimized:
if tile.stride != [-1, -1] or tile.dilation != [-1,-1]:
continue
new_operation = Conv2dOperation(ConvKind.Fprop,
iterator_algorithm,
tile.minimum_compute_capability,
tile,
A, B, C,
element_epilogue,
stride_support,
epilogue_functor,
swizzling_functor_,
group_mode=GroupMode.Depthwise)
manifest.append(new_operation)
operations.append(new_operation)
return operations
###################################################################################################
###################################################################################################
#
def GenerateSM50_Simt(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
MathInstruction( \
[1, 1, 1], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 50
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
if math_inst.element_a == DataType.f32:
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
#
def GenerateSM50_Simt_complex(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add_complex),
]
min_cc = 50
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
DataType.cf32,
DataType.cf32,
DataType.cf32,
DataType.cf32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
#
def GenerateSM50(manifest, cuda_version):
GenerateSM50_Simt(manifest, cuda_version)
GenerateSM50_Simt_complex(manifest, cuda_version)
###################################################################################################
###################################################################################################
#
def GenerateSM60_Simt(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 60
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
def GenerateSM60_Simt_DepthwiseConv2d(manifest, cuda_version):
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 60
max_cc = 1024
alignment_constraints = [8,]
filter_3x3 = [3, 3]
filter_5x5 = [5, 5]
# [stride_h, stride_w]
# [-1, -1] means all stride size.
strides = [[-1,-1], [1, 1], [2, 2]]
# [dilation_h, dilation_w]
# [-1, -1] means all dilation size.
dilations = [[-1,-1], [1, 1], [2, 2]]
#groups per thread block
g16 = 16
g32 = 32
g64 = 64
#output shape per thread block
npq_1x4x4 = [1, 4, 4]
npq_1x8x8 = [1, 8, 8]
npq_1x10x10 = [1, 10, 10]
tile_descriptions = []
for math_inst in math_instructions:
for stride, dilation in product(strides, dilations):
tile_descriptions.extend([
# filter3x3 ThreadBlock_output, filter, stage, warp
Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8+[g32], filter_3x3, 3, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8+[g64], filter_3x3, 3, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8+[g16], filter_3x3, 3, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x10x10+[g64], filter_3x3, 2, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4+[g32], filter_3x3, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4+[g64], filter_3x3, 4, stride, dilation,[4, 1, 1], math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4+[g16], filter_3x3, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
# filter5x5 ThreadBlock_output, filter, stage, warp
Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8+[g32], filter_5x5, 3, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8+[g64], filter_5x5, 3, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8+[g16], filter_5x5, 3, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x10x10+[g64], filter_5x5, 2, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4+[g32], filter_5x5, 4, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4+[g64], filter_5x5, 4, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc),
Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4+[g16], filter_5x5, 4, stride, dilation,[4, 1, 1],math_inst, min_cc, max_cc)
])
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateDepthwiseConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
#
def GenerateSM60(manifest, cuda_version):
GenerateSM60_Simt(manifest, cuda_version)
GenerateSM60_Simt_DepthwiseConv2d(manifest, cuda_version)
###################################################################################################
###################################################################################################
#
def GenerateSM61_Simt(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 4], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 61
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
#
#
def GenerateSM61(manifest, cuda_version):
GenerateSM61_Simt(manifest, cuda_version)
###################################################################################################
###################################################################################################
#
def GenerateSM70_TensorOp_884(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 1):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 70
max_cc = 75
alignment_constraints = [8, 4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
#
def GenerateSM70_PlanarComplexTensorOp_884(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 1):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
math_instructions = [
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 70
max_cc = 75
alignment_constraints = [8, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, complex_transforms)
#
def GenerateSM70_WmmaTensorOp_161616(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 16, 16], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.WmmaTensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 16, 16], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.WmmaTensorOp, \
MathOperation.multiply_add),
]
min_cc = 70
max_cc = 1024
alignment_constraints = [8,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
##################################################################################################
#
def GenerateSM70(manifest, cuda_version):
GenerateSM70_TensorOp_884(manifest, cuda_version)
GenerateSM70_PlanarComplexTensorOp_884(manifest, cuda_version)
# To limit build size, WMMA GEMMs are disabled for now.
#
#GenerateSM70_WmmaTensorOp_161616(manifest, cuda_version)
###################################################################################################
###################################################################################################
#
def GenerateSM75_TensorOp_1688_FewChannels(manifest, cuda_version, math_inst):
min_cc = 75
max_cc = 1024
tile_descriptions = [
TileDescription([128, 64, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 2], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [4, 8])
CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [1, 2, 4])
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [4, 8])
CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [1, 2, 4])
#
def GenerateSM75_TensorOp_1688(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [8, 4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [1, 2, 2], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
# Separate generator for 'few channels' specializations
GenerateSM75_TensorOp_1688_FewChannels(manifest, cuda_version, math_inst)
#
#
def GenerateSM75_PlanarComplexTensorOp_1688(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [8, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, complex_transforms)
#
def GenerateSM75_TensorOp_8816_TN(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 16], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[8, 8, 16], \
DataType.u8, DataType.u8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 90
alignment_constraints = [16,]
alignment_constraints_small_channels = [16, 8, 4]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 32, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
DataType.s32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
op.C.alignment = 16
else:
op.C.alignment = 8
#
#
def GenerateSM75_TensorOp_8816_Interleaved(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32),
]
math_instructions = [
MathInstruction( \
[8, 8, 16], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[8, 8, 16], \
DataType.u8, DataType.u8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 90
alignment_constraints = [16,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
op.C.alignment = 8
#
#
def GenerateSM75_TensorOp_8832_TN(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 32], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[8, 8, 32], \
DataType.u4, DataType.u4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 89
alignment_constraints = [32,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
DataType.s32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
op.C.alignment = 16
elif op.tile_description.threadblock_shape[1] == 64:
op.C.alignment = 8
else:
op.C.alignment = 8
#
#
def GenerateSM75_TensorOp_8832_Interleaved(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64),
]
math_instructions = [
MathInstruction( \
[8, 8, 32], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[8, 8, 32], \
DataType.u4, DataType.u4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 89
alignment_constraints = [32,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
op.C.alignment = 16
#
#
def GenerateSM75_TensorOp_88128(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 128], \
DataType.b1, DataType.b1, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.xor_popc),
]
min_cc = 75
max_cc = {
MathOperation.xor_popc: 89,
MathOperation.and_popc: 90
}
alignment_constraints = [128,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 512], 2, [4, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 256, 512], 2, [2, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 128, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 256, 512], 2, [1, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([256, 64, 512], 2, [4, 1, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 128, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 64, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 64, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
]
data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM75_WmmaTensorOp_161616(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 16, 16], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.WmmaTensorOp, \
MathOperation.multiply_add),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [16,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
DataType.f32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
#
def GenerateSM75_Simt_complex(manifest, cuda_version):
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add_complex),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc)
]
data_type = [
DataType.cf32,
DataType.cf32,
DataType.cf32,
DataType.cf32
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
def GenerateSM75(manifest, cuda_version):
GenerateSM75_TensorOp_1688(manifest, cuda_version)
GenerateSM75_PlanarComplexTensorOp_1688(manifest, cuda_version)
GenerateSM75_TensorOp_8816_TN(manifest, cuda_version)
GenerateSM75_TensorOp_8816_Interleaved(manifest, cuda_version)
GenerateSM75_TensorOp_8832_TN(manifest, cuda_version)
GenerateSM75_TensorOp_8832_Interleaved(manifest, cuda_version)
GenerateSM75_TensorOp_88128(manifest, cuda_version)
#GenerateSM75_WmmaTensorOp_161616(manifest, cuda_version)
GenerateSM75_Simt_complex(manifest, cuda_version)
###################################################################################################
###################################################################################################
#
def GenerateSM80_TensorOp_16816(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 16], \
DataType.bf16, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [8, 4, 2]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 3, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [4, 8])
CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type, 8)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [4, 8])
CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type_mixed, 8)
#
#
def GenerateSM80_SparseTensorOp_16832(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 32], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 32], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 32], \
DataType.bf16, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [8]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
#
def GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 16], \
DataType.bf16, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [8, ]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 128, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, complex_transforms)
#
def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
# Upcast on Operand A
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.s8, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.s8, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.u8, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.u8, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.s8, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
]
min_cc = 80
max_cc = 1024
# For mixed-input alignment constraints are a list of lists, where the
# inner list contains the alignment constraints for operands/matrices
# [[alignA, alignB, alignC],..]
alignment_constraints = [[16, 8, 8],]
for math_inst in math_instructions:
tile_descriptions = [
# 128x128
TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
# 128x64
TileDescription([128, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
# 128x32
TileDescription([128, 32, 64], 9, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
# 128x16
TileDescription([128, 16, 64], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 16, 64], 3, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
# streamk uses more regs which can cause spill for the biggest warp tile size when the accumulators are 32bit.
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_b,
math_inst.element_accumulator,
]
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
for op in operations:
if (DataTypeSize[op.C.element] == 16) and \
(op.tile_description.threadblock_shape[1] <= 32):
op.C.alignment = 4
#
def GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.s8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.bf16, DataType.s8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.u8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.bf16, DataType.u8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
]
min_cc = 80
max_cc = 1024
# For mixed-input alignment constraints are a list of lists, where the
# inner list contains the alignment constraints for operands/matrices
# [[alignA, alignB, alignC],..]
alignment_constraints = [[8, 16, 8],]
for math_inst in math_instructions:
tile_descriptions = [
# 128x128
TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
# 128x64
TileDescription([128, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
# 128x32
TileDescription([128, 32, 64], 9, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 9, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
# 128x16
TileDescription([128, 16, 64], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 16, 64], 3, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 16, 32], 9, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 16, 32], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 16, 32], 3, [2, 1, 1], math_inst, min_cc, max_cc),
# 256x16
TileDescription([256, 16, 32], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([256, 16, 32], 3, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
# streamk uses more regs which can cause spill for the biggest warp tile size when the accumulators are 32bit.
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
for op in operations:
if op.tile_description.threadblock_shape[1] <= 32:
op.C.alignment = 4
#
def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 32], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[16, 8, 32], \
DataType.u8, DataType.u8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 80
max_cc = 1024
smem_usage = 164
alignment_constraints = [16,]
alignment_constraints_small_channels = [16, 8, 4]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 32, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 64], 6, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 64], 6, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [math_inst.element_a, math_inst.element_b, math_inst.element_accumulator, DataType.s32]
data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
operations += CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
if op.tile_description.threadblock_shape[0] == 32:
op.C.alignment = 8
else:
op.C.alignment = 16
else:
op.C.alignment = 8
#
#
def GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
]
math_inst = \
MathInstruction( \
[16, 8, 64], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate)
min_cc = 80
max_cc = 1024
alignment_constraints = [16,]
tile_descriptions = [
TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.s8, DataType.s8, DataType.s32, DataType.s32]
data_type_mixed = [DataType.s8, DataType.s8, DataType.s8, DataType.f32]
CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
operations = []
operations += CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
op.C.alignment = 16
else:
op.C.alignment = 8
#
#
def GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32),
]
math_instructions = [
MathInstruction( \
[16, 8, 32], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[16, 8, 32], \
DataType.u8, DataType.u8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [16,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
op.C.alignment = 8
#
#
def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 64], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[16, 8, 64], \
DataType.u4, DataType.u4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [32,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [math_inst.element_a, math_inst.element_b, math_inst.element_accumulator, DataType.s32]
data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
op.C.alignment = 16
elif op.tile_description.threadblock_shape[1] == 64:
op.C.alignment = 8
else:
op.C.alignment = 8
#
#
def GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
]
math_inst = \
MathInstruction( \
[16, 8, 128], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate)
min_cc = 80
max_cc = 1024
alignment_constraints = [32,]
tile_descriptions = [
TileDescription([ 64, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.s4, DataType.s4, DataType.s32, DataType.s32]
data_type_mixed = [DataType.s4, DataType.s4, DataType.s4, DataType.f32]
CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
operations = []
operations += CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] > 128:
op.C.alignment = 16
else:
op.C.alignment = 8
#
#
def GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64),
]
math_instructions = [
MathInstruction( \
[16, 8, 64], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
MathInstruction( \
[16, 8, 64], \
DataType.u4, DataType.u4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [32,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64)
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
for op in operations:
op.C.alignment = 16
#
#
def GenerateSM80_TensorOp_168256(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 256], \
DataType.b1, DataType.b1, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.xor_popc),
MathInstruction( \
[16, 8, 256], \
DataType.b1, DataType.b1, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.and_popc),
]
min_cc = 80
max_cc = {
MathOperation.xor_popc: 89,
MathOperation.and_popc: 90
}
alignment_constraints = [128,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 512], 3, [4, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 256, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([256, 64, 512], 4, [4, 1, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 256, 512], 4, [1, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 128, 512], 5, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 64, 512], 6, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 128, 512], 6, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 64, 512], 10, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([256, 128, 1024], 3, [4, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 256, 1024], 3, [2, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([256, 64, 1024], 4, [4, 1, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 256, 1024], 4, [1, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 128, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([128, 64, 1024], 3, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 128, 1024], 3, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
TileDescription([ 64, 64, 1024], 5, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
]
data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM80_TensorOp_1688(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
]
min_cc = 80
max_cc = 1024
alignment_constraints = [4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
#
#
def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_fast_f16),
MathInstruction( \
[16, 8, 8], \
DataType.bf16, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_fast_bf16),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
#
def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_fast_f32),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
def GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_fast_f32)
min_cc = 80
max_cc = 1024
tile_descriptions = [
TileDescription([128, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
]
alignment_constraints = [1,]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [4]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM80_TensorOp_1688_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 80
max_cc = 1024
tile_descriptions = [
TileDescription([128, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
]
alignment_constraints = [1,]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM80_TensorOp_1688_rank_k(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_fast_f32),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [1, 2, 4] # Alignment only applies to A in SYRK
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
#TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
#TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f32, DataType.f32, DataType.f32]
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM80_TensorOp_1688_rank_k_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex),
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_fast_f32),
]
min_cc = 80
max_cc = 1024
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
DataType.cf32, DataType.cf32, DataType.cf32
]
alignment_constraints = [1,]
# SYRK
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HERK
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM80_TensorOp_1688_trmm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_fast_f32),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [1, 2, 4]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
#TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM80_TensorOp_1688_trmm_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex),
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_fast_f32),
]
min_cc = 80
max_cc = 1024
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
]
alignment_constraints = [1,]
complex_transforms = [
ComplexTransform.none, ComplexTransform.conj,
]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM80_TensorOp_1688_symm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
# A and B have same layouts
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_fast_f32),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [
1, 2, 4
]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
#TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
#TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM80_TensorOp_1688_symm_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.tf32, DataType.tf32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex),
MathInstruction( \
[16, 8, 8], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_fast_f32),
]
min_cc = 80
max_cc = 1024
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
]
alignment_constraints = [1,]
# SYMM
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HEMM
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM80_TensorOp_884(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 32, 16], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([32, 256, 16], 3, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM80_TensorOp_884_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8 ], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8 ], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8 ], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 8 ], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 8 ], 4, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 3, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
def GenerateSM80_TensorOp_884_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM80_TensorOp_884_rank_k(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64]
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM80_TensorOp_884_rank_k_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64]
# SYRK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HERK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM80_TensorOp_884_rank_k_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [ComplexTransform.none,]
# SYRK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HERK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM80_TensorOp_884_trmm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM80_TensorOp_884_trmm_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
ComplexTransform.none, ComplexTransform.conj,
]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM80_TensorOp_884_trmm_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
ComplexTransform.none, ComplexTransform.conj,
]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM80_TensorOp_884_symm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
# SYMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HEMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[8, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [ComplexTransform.none,]
# SYMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HEMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
###################################################################################################
#
def GenerateSM80_Simt_f32(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 8], 5, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 8], 4, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
#
def GenerateSM80_Simt_f64(manifest, cuda_version):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
##################################################################################################
#
def GenerateSM80_Simt_complex(manifest, cuda_version):
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add_complex),
]
min_cc = 80
max_cc = 1024
alignment_constraints = [1,]
data_type = [
DataType.cf32,
DataType.cf32,
DataType.cf32,
DataType.cf32
]
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
]
CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints, complex_transforms)
conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
###################################################################################################
#
def GenerateSM80(manifest, cuda_version):
GenerateSM80_TensorOp_16816(manifest, cuda_version)
GenerateSM80_SparseTensorOp_16832(manifest, cuda_version)
GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version)
GenerateSM80_TensorOp_1688(manifest, cuda_version)
GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version)
GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version)
GenerateSM80_TensorOp_1688_complex(manifest, cuda_version)
# 3xTF32
GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version)
GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, cuda_version)
GenerateSM80_TensorOp_1688_rank_k(manifest, cuda_version)
GenerateSM80_TensorOp_1688_rank_k_complex(manifest, cuda_version)
GenerateSM80_TensorOp_1688_trmm(manifest, cuda_version)
GenerateSM80_TensorOp_1688_trmm_complex(manifest, cuda_version)
GenerateSM80_TensorOp_1688_symm(manifest, cuda_version)
GenerateSM80_TensorOp_1688_symm_complex(manifest, cuda_version)
GenerateSM80_TensorOp_884(manifest, cuda_version)
GenerateSM80_TensorOp_884_complex(manifest, cuda_version)
GenerateSM80_TensorOp_884_complex_gaussian(manifest, cuda_version)
GenerateSM80_TensorOp_884_rank_k(manifest, cuda_version)
GenerateSM80_TensorOp_884_rank_k_complex(manifest, cuda_version)
GenerateSM80_TensorOp_884_rank_k_complex_gaussian(manifest, cuda_version)
GenerateSM80_TensorOp_884_trmm(manifest, cuda_version)
GenerateSM80_TensorOp_884_trmm_complex(manifest, cuda_version)
GenerateSM80_TensorOp_884_trmm_complex_gaussian(manifest, cuda_version)
GenerateSM80_TensorOp_884_symm(manifest, cuda_version)
GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version)
GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version)
GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version)
GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version)
GenerateSM80_TensorOp_16832_TN(manifest, cuda_version)
GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version)
GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version)
GenerateSM80_TensorOp_16864_TN(manifest, cuda_version)
GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version)
GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version)
GenerateSM80_TensorOp_168256(manifest, cuda_version)
GenerateSM80_Simt_f32(manifest, cuda_version)
GenerateSM80_Simt_f64(manifest, cuda_version)
GenerateSM80_Simt_complex(manifest, cuda_version)
###################################################################################################
#
def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments.
layouts = [
[[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 1]],
]
math_instructions = [
MathInstruction(
[64, 128, 16],
DataType.f16, DataType.f16, DataType.f16,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 16],
DataType.f16, DataType.f16, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 16],
DataType.bf16, DataType.bf16, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
]
min_cc = 90
max_cc = 90
for math_inst in math_instructions:
tile_descriptions_small = [
# Not compatible with TmaWarpSpecializedCooperative
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
tile_descriptions_medium = [
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
tile_descriptions_large = [
TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
0, [4, 2, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
0, [4, 2, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
tile_descriptions = tile_descriptions_medium + tile_descriptions_large
data_type = {
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
# Set alignment c based on Destination format.
for layout in layouts:
if data_type["c_type"] in [DataType.s32, DataType.f32]:
layout[2][1] = 4
elif data_type["c_type"] in [DataType.f16, DataType.bf16]:
layout[2][1] = 8
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
stream_k_schedules = [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
else:
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
# TmaWarpSpecializedCooperative and TmaWarpSpecializedPingpong require CUDA version >= 12.1 for optimal performance.
]
stream_k_schedules = []
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# Add stream-K variants
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
# persistent kernels with TMA epilogues
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# not enough smem for 256x128 f32 out with C allocation
if data_type["d_type"] == DataType.f32:
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_medium, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_medium, data_type,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
tile_schedulers=[TileSchedulerType.StreamK])
else:
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
tile_schedulers=[TileSchedulerType.StreamK])
# Emit instance without C allocation + load
data_type["c_type"] = DataType.void
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
tile_schedulers=[TileSchedulerType.StreamK])
# for mixed precision kernels, also generate kernels that write output matrix in the A/B format
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = {
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_a,
"d_type" : math_inst.element_a,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
# Set alignment c based on Destination format.
for layout in layouts:
if data_type_mixed["c_type"] in [DataType.s32, DataType.f32]:
layout[2][1] = 4
elif data_type_mixed["c_type"] in [DataType.f16, DataType.bf16]:
layout[2][1] = 8
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, schedules)
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
# persistent kernels with TMA epilogues
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
tile_schedulers=[TileSchedulerType.StreamK])
# Emit instance without C allocation+load
data_type_mixed["c_type"] = DataType.void
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
tile_schedulers=[TileSchedulerType.StreamK])
#
def GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments.
layouts = [
[[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
]
math_instructions = [
MathInstruction(
[64, 128, 16],
DataType.f16, DataType.f16, DataType.f16,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 16],
DataType.f16, DataType.f16, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 16],
DataType.bf16, DataType.bf16, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
]
min_cc = 90
max_cc = 90
for math_inst in math_instructions:
tile_descriptions_small = [
# TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
]
tile_descriptions_medium = [
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
# TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
]
tile_descriptions = tile_descriptions_small + tile_descriptions_medium
data_type = {
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
# Set alignment c based on Destination format.
for layout in layouts:
if data_type["c_type"] in [DataType.s32, DataType.f32]:
layout[2][1] = 4
elif data_type["c_type"] in [DataType.f16, DataType.bf16]:
layout[2][1] = 8
schedules = [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
stream_k_schedules = []
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
schedules += [
[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
# [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]
]
stream_k_schedules += [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# Add stream-K variants
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
# persistent kernels with TMA epilogues
# if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
# tile_schedulers=[TileSchedulerType.StreamK])
# # Emit instance without C allocation + load
# data_type["c_type"] = DataType.void
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
# tile_schedulers=[TileSchedulerType.StreamK])
# for mixed precision kernels, also generate kernels that write output matrix in the A/B format
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = {
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_a,
"d_type" : math_inst.element_a,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
# Set alignment c based on Destination format.
for layout in layouts:
if data_type_mixed["c_type"] in [DataType.s32, DataType.f32]:
layout[2][1] = 4
elif data_type_mixed["c_type"] in [DataType.f16, DataType.bf16]:
layout[2][1] = 8
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, schedules)
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
# persistent kernels with TMA epilogues
# if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
# tile_schedulers=[TileSchedulerType.StreamK])
# # Emit instance without C allocation+load
# data_type_mixed["c_type"] = DataType.void
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
# tile_schedulers=[TileSchedulerType.StreamK])
#
def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments
layouts_tf32 = [
[[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4]],
[[LayoutType.RowMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4]],
[[LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4]],
[[LayoutType.ColumnMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4]],
]
math_inst = MathInstruction(
[64, 128, 8],
DataType.tf32, DataType.tf32, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add)
math_inst_largeN = MathInstruction(
[64, 256, 8],
DataType.tf32, DataType.tf32, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add)
min_cc = 90
max_cc = 90
tile_descriptions_large = [
TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst_largeN.instruction_shape[0]*2, math_inst_largeN.instruction_shape[1], math_inst_largeN.instruction_shape[2]*4],
0, [4, 1, 1], math_inst_largeN, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst_largeN.instruction_shape[0]*2, math_inst_largeN.instruction_shape[1], math_inst_largeN.instruction_shape[2]*4],
0, [4, 1, 1], math_inst_largeN, min_cc, max_cc, [1,2,1]),
]
tile_descriptions_medium = [
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
tile_descriptions_small = [
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
tile_descriptions = tile_descriptions_medium + tile_descriptions_small
data_types = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : DataType.f32,
"b_type" : DataType.f32,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : DataType.f32
}
]
schedules_default = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
]
# TMA kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
# because they use NN kernels underneath and transposing its epilogue will get the correct output
schedules_transposed_epilogue = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
]
# TMA kernels with TN, NN, or NT layout
layouts_tf32_tn_nn_nt = [layouts_tf32[0], layouts_tf32[2], layouts_tf32[3]]
# TMA kernels with TT layout
layouts_tf32_tt = [layouts_tf32[1]]
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions_small, data_types, [
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]
])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions_medium, data_types, [
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions_large, data_types, [
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions_small, data_types, [
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.EpilogueTransposed]
])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions_medium, data_types, [
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.EpilogueTransposed],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.EpilogueTransposed]
])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions_large, data_types, [
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.EpilogueTransposed],
])
else:
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions, data_types, schedules_default)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_types, schedules_transposed_epilogue)
#
def GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments.
layouts = [
[[LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 1], [LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 1], [LayoutType.RowMajor, 1], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1]],
[[LayoutType.ColumnMajor, 1], [LayoutType.RowMajor, 1], [LayoutType.ColumnMajor, 1]],
]
math_inst = MathInstruction(
[64, 128, 8],
DataType.tf32, DataType.tf32, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add)
min_cc = 90
max_cc = 90
tile_descriptions_medium = [
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1])
]
tile_descriptions_small = [
# TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1])
]
tile_descriptions = tile_descriptions_medium + tile_descriptions_small
data_types = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : DataType.f32,
"b_type" : DataType.f32,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : DataType.f32
}
]
is_tt_layout = lambda v: v[0][0] == LayoutType.RowMajor and v[1][0] == LayoutType.RowMajor
# Split kernels into TN/NT, NN or TT layouts
layouts_tn_nn_nt = filter(lambda v: not is_tt_layout(v), layouts)
layouts_tt = filter(is_tt_layout, layouts)
CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
])
# Kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
# because they use NN kernels underneath and transposing its epilogue will get the correct output
CreateGemmUniversal3xOperator(manifest, layouts_tt, tile_descriptions, data_types, [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
])
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
# [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
])
# Stream-K schedules
CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
], tile_schedulers=[TileSchedulerType.StreamK])
#
def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments
layouts = [
[[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16]],
]
math_instructions = [
MathInstruction(
[64, 128, 32],
DataType.s8, DataType.s8, DataType.s32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.u8, DataType.u8, DataType.s32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
]
min_cc = 90
max_cc = 90
for math_inst in math_instructions:
# 64x128x128
tile_descriptions_small = [
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
# 128x128x128
tile_descriptions_medium = [
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
]
tile_descriptions = tile_descriptions_medium + tile_descriptions_small
data_types = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.s8,
"d_type" : math_inst.element_a,
"acc_type" : math_inst.element_accumulator,
"epi_type" : DataType.f32
}
]
for data_type in data_types:
for layout in layouts:
layout[2][1] = 128 // DataTypeSize[data_type["d_type"]]
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type)
# persistent kernels with TMA epilogues
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# Emit instance without C allocation+load
data_types += [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.void,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
]
for data_type in data_types:
# Set output alignment based on destination format first
for layout in layouts:
layout[2][1] = 128 // DataTypeSize[data_type["d_type"]]
# Pingpong persistent
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]])
# Cooperative persistent
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_medium, data_type,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]],
tile_schedulers=[TileSchedulerType.Persistent, TileSchedulerType.StreamK]
)
#
def GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments
layouts = [
[[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
[[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
]
math_instructions = [
MathInstruction(
[64, 128, 32],
DataType.s8, DataType.s8, DataType.s32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.u8, DataType.u8, DataType.s32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
]
min_cc = 90
max_cc = 90
for math_inst in math_instructions:
tile_descriptions_small = [
# TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
]
tile_descriptions_medium = [
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
]
tile_descriptions = tile_descriptions_medium + tile_descriptions_small
data_types = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : math_inst.element_accumulator,
"d_type" : math_inst.element_accumulator,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.s8,
"d_type" : math_inst.element_a,
"acc_type" : math_inst.element_accumulator,
"epi_type" : DataType.f32
}
]
for data_type in data_types:
for layout in layouts:
layout[2][1] = 128 // DataTypeSize[data_type["d_type"]]
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
])
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [
# [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]],
tile_schedulers=[TileSchedulerType.StreamK])
#
def GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments
layouts = [
[[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 1]], # TN Layout
]
math_instructions = [
# inst 64x128x32
MathInstruction(
[64, 128, 32],
DataType.e4m3, DataType.e4m3, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.e4m3, DataType.e5m2, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.e5m2, DataType.e4m3, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.e5m2, DataType.e5m2, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
]
min_cc = 90
max_cc = 90
for math_inst in math_instructions:
data_types = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f32,
"d_type" : DataType.f32,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f32,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f32,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.bf16,
"d_type" : DataType.bf16,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.bf16,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.bf16,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f16,
"d_type" : DataType.f16,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f16,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f16,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
]
data_types_large_tile = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.void,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.void,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
]
if math_inst.instruction_shape[1] == 128:
tile_descriptions_small = [
# 64x128x128
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
]
tile_descriptions_large = [
# 256x128x128
TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
]
tile_descriptions = [
# 128x128x128
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
]
else:
assert False, "math inst is not supported"
# some schedules disabled to save on library size
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
schedules = [
#[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized]
]
stream_k_schedules = [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized]]
else:
schedules = [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
# TmaWarpSpecializedCooperative require CUDA version >= 12.1 for optimal performance.
]
stream_k_schedules = []
for data_type in data_types:
# With No-SMEM epilogues
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# Persistent kernels with TMA epilogues
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
[KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Small tiles
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_small, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized]])
# Large tiles
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_large, data_types_large_tile,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
[KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Add stream-K variants (with and without TMA epilogues)
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
[KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
tile_schedulers=[TileSchedulerType.StreamK])
#
def GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
return
# layouts for ABC and their alignments
layouts = [
[[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]], # TN Layout
[[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]], # TN Layout
]
math_instructions = [
# inst 64x128x32
MathInstruction(
[64, 128, 32],
DataType.e4m3, DataType.e4m3, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.e4m3, DataType.e5m2, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.e5m2, DataType.e4m3, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
MathInstruction(
[64, 128, 32],
DataType.e5m2, DataType.e5m2, DataType.f32,
OpcodeClass.TensorOp,
MathOperation.multiply_add),
# inst 64x64x32
# MathInstruction(
# [64, 64, 32],
# DataType.e4m3, DataType.e4m3, DataType.f32,
# OpcodeClass.TensorOp,
# MathOperation.multiply_add),
# MathInstruction(
# [64, 64, 32],
# DataType.e4m3, DataType.e5m2, DataType.f32,
# OpcodeClass.TensorOp,
# MathOperation.multiply_add),
# MathInstruction(
# [64, 64, 32],
# DataType.e5m2, DataType.e4m3, DataType.f32,
# OpcodeClass.TensorOp,
# MathOperation.multiply_add),
# MathInstruction(
# [64, 64, 32],
# DataType.e5m2, DataType.e5m2, DataType.f32,
# OpcodeClass.TensorOp,
# MathOperation.multiply_add),
]
min_cc = 90
max_cc = 90
for math_inst in math_instructions:
data_types = [
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f32,
"d_type" : DataType.f32,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f32,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f32,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.bf16,
"d_type" : DataType.bf16,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.bf16,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.bf16,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f16,
"d_type" : DataType.f16,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f16,
"d_type" : DataType.e4m3,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
{
"a_type" : math_inst.element_a,
"b_type" : math_inst.element_b,
"c_type" : DataType.f16,
"d_type" : DataType.e5m2,
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
},
]
if math_inst.instruction_shape[1] == 128:
tile_descriptions = [
# 128x128x128
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
]
# elif math_inst.instruction_shape[1] == 64:
# tile_descriptions = [
# # 256x64x128
# TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
# ]
else:
assert False, "math inst is not supported"
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
schedules = [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
# [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
]
stream_k_schedules = [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
else:
schedules = [
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
stream_k_schedules = []
for data_type in data_types:
# With No-SMEM epilogues
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
# Persistent kernels with TMA epilogues
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Add stream-K variants (with and without TMA epilogues)
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
# tile_schedulers=[TileSchedulerType.StreamK])
#
def GenerateSM90_TensorOp_1684(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = MathInstruction(
[16, 8, 4],
DataType.f64, DataType.f64, DataType.f64,
OpcodeClass.TensorOp,
MathOperation.multiply_add)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 32, 16], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([32, 256, 16], 3, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
CreateGemmOperator(manifest, layouts, tile_descriptions,
data_type, alignment_constraints)
#
#
def GenerateSM90_TensorOp_1684_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8 ], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8 ], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8 ], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 8 ], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 8 ], 4, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 3, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_rank_k(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64]
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM90_TensorOp_1684_rank_k_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64]
# SYRK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HERK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM90_TensorOp_1684_rank_k_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor),
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [ComplexTransform.none,]
# SYRK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HERK computation
CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM90_TensorOp_1684_trmm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM90_TensorOp_1684_trmm_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
ComplexTransform.none, ComplexTransform.conj,
]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_trmm_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
diag_types = [
DiagType.NonUnit, DiagType.Unit,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [
ComplexTransform.none, ComplexTransform.conj,
]
CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_symm(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM90_TensorOp_1684_symm_complex(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
# SYMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HEMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
side_modes = [
SideMode.Left, SideMode.Right,
]
fill_modes = [
FillMode.Lower, FillMode.Upper,
]
math_inst = \
MathInstruction( \
[16, 8, 4], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_complex_gaussian)
min_cc = 90
max_cc = 90
alignment_constraints = [1,]
tile_descriptions = [
TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
#TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
]
data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
complex_transforms = [ComplexTransform.none,]
# SYMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.symmetric)
# HEMM computation
CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
data_type, alignment_constraints, BlasMode.hermitian)
#
###################################################################################################
#
def GenerateSM90(manifest, cuda_version):
GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_1684(manifest, cuda_version)
GenerateSM90_TensorOp_1684_complex(manifest, cuda_version)
GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version)
GenerateSM90_TensorOp_1684_rank_k(manifest, cuda_version)
GenerateSM90_TensorOp_1684_rank_k_complex(manifest, cuda_version)
GenerateSM90_TensorOp_1684_rank_k_complex_gaussian(manifest, cuda_version)
GenerateSM90_TensorOp_1684_trmm(manifest, cuda_version)
GenerateSM90_TensorOp_1684_trmm_complex(manifest, cuda_version)
GenerateSM90_TensorOp_1684_trmm_complex_gaussian(manifest, cuda_version)
GenerateSM90_TensorOp_1684_symm(manifest, cuda_version)
GenerateSM90_TensorOp_1684_symm_complex(manifest, cuda_version)
GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version)
###################################################################################################
def numeric_log_level(log_level: str) -> int:
"""
Converts the string identifier of the log level into the numeric identifier used
in setting the log level
:param x: string representation of log level (e.g., 'INFO', 'DEBUG')
:type x: str
:return: numeric representation of log level
:rtype: int
"""
numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError(f'Invalid log level: {log_level}')
return numeric_level
# This function for defining the ArgumentParser is used to make it easy for the CUTLASS Python interface
# to leverage the functionality in this file without running this script via a shell prompt.
def define_parser():
parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")
parser.add_argument("--operations", default="all", help="Specifies the operation to generate (gemm, all)")
parser.add_argument("--build-dir", default=".", required=False, help="CUTLASS top-level build directory")
parser.add_argument("--curr-build-dir", default=".", help="CUTLASS current build directory. cmake files will be emitted in this directory")
parser.add_argument("--generator-target", default='library', help="Target of CUTLASS Library Generator.")
parser.add_argument("--architectures", default='53;60;61;70;75;80;90', help="Target compute architectures")
parser.add_argument("--kernels", default='', help='Comma delimited list to filter kernels by name.')
parser.add_argument("--ignore-kernels", default='', help='Comma delimited list of kernels to exclude from build.')
parser.add_argument("--filter-by-cc", default='True', type=str, help='If enabled, kernels whose compute capability range is not satisfied by the build target are excluded.')
parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit")
parser.add_argument('--kernel-filter-file', type=str, default=None, required=False, help='Full path of filter file')
parser.add_argument('--selected-kernel-list', type=str, default=None, required=False,
help='Specify the output log file containing all enabled kernels in this build')
parser.add_argument("--interface-dir", default=None, required=False, help="Interface header to kernels")
parser.add_argument("--disable-full-archs-compilation", action="store_true", required=False, help="Disable compilation for every archs in --architectures")
parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
help='Logging level to be used by the generator script')
_add_package_disablement_flag(parser)
return parser
if __name__ == "__main__":
parser = define_parser()
args = parser.parse_args()
# Set the logging level based on the user-provided `--log-level` command-line option
logging.basicConfig(level=args.log_level)
manifest = Manifest(args)
GenerateSM50(manifest, args.cuda_version)
GenerateSM60(manifest, args.cuda_version)
GenerateSM61(manifest, args.cuda_version)
GenerateSM70(manifest, args.cuda_version)
GenerateSM75(manifest, args.cuda_version)
GenerateSM80(manifest, args.cuda_version)
GenerateSM90(manifest, args.cuda_version)
if 'library' in args.generator_target.split(','):
manifest.emit(GeneratorTarget.Library)
if args.selected_kernel_list is not None:
if len(manifest.selected_kernels) > 0:
with open(args.selected_kernel_list, 'w') as file_writer:
for line in manifest.selected_kernels:
file_writer.write("%s\n" % line)
###################################################################################################