cutlass/tools/library/scripts/generator.py

941 lines
33 KiB
Python
Raw Normal View History

#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#
import enum
import os.path
import shutil
import argparse
from library import *
from manifest import *
from gemm_operation import *
###################################################################################################
#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
# by default, use the latest CUDA Toolkit version
cuda_version = [10, 2, 82]
# Update cuda_version based on parsed string
if semantic_ver_string != '':
for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]):
if i < len(cuda_version):
cuda_version[i] = x
else:
cuda_version.append(x)
return cuda_version >= [major, minor, patch]
###################################################################################################
###################################################################################################
#
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Cohort):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
operations = []
# by default, only generate the largest tile and largest alignment
if manifest.args.kernels == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for layout in layouts:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = GemmOperation(GemmKind.Gemm, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
#
def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
element_a, element_b, element_c, element_epilogue = data_type
gemm_kinds = [GemmKind.PlanarComplex, GemmKind.PlanarComplexArray]
# by default, only generate the largest tile and largest alignment
if manifest.args.kernels == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
for gemm_kind in gemm_kinds:
for layout in layouts:
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
manifest.append(GemmOperation(gemm_kind, \
tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue))
return
###################################################################################################
###################################################################################################
###################################################################################################
#
def GenerateSM50_Simt(manifest, args):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
MathInstruction( \
[1, 1, 1], \
DataType.f64, DataType.f64, DataType.f64, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 50
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM50(manifest, args):
GenerateSM50_Simt(manifest, args)
###################################################################################################
###################################################################################################
#
def GenerateSM60_Simt(manifest, args):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 60
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
#
def GenerateSM60(manifest, args):
GenerateSM60_Simt(manifest, args)
###################################################################################################
###################################################################################################
#
def GenerateSM61_Simt(manifest, args):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[1, 1, 4], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 61
max_cc = 1024
alignment_constraints = [1,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
#
def GenerateSM61(manifest, args):
GenerateSM61_Simt(manifest, args)
###################################################################################################
###################################################################################################
#
def GenerateSM70_TensorOp_884(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 1):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 70
max_cc = 75
alignment_constraints = [8, 4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
def GenerateSM70_PlanarComplexTensorOp_884(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 1):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
math_instructions = [
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[8, 8, 4], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 70
max_cc = 75
alignment_constraints = [8, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, complex_transforms)
#
def GenerateSM70_WmmaTensorOp_161616(manifest, args):
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 16, 16], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.WmmaTensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 16, 16], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.WmmaTensorOp, \
MathOperation.multiply_add),
]
min_cc = 70
max_cc = 1024
alignment_constraints = [8,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
def GenerateSM70(manifest, args):
GenerateSM70_TensorOp_884(manifest, args)
GenerateSM70_PlanarComplexTensorOp_884(manifest, args)
#GenerateSM70_WmmaTensorOp_161616(manifest, args)
###################################################################################################
###################################################################################################
#
def GenerateSM75_TensorOp_1688(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [8, 4, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
#
def GenerateSM75_PlanarComplexTensorOp_1688(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
complex_transforms = [
(ComplexTransform.none, ComplexTransform.none),
(ComplexTransform.conj, ComplexTransform.none),
(ComplexTransform.none, ComplexTransform.conj),
(ComplexTransform.conj, ComplexTransform.conj)
]
math_instructions = [
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
MathInstruction( \
[16, 8, 8], \
DataType.f16, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [8, 2, 1]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, complex_transforms)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, complex_transforms)
#
def GenerateSM75_TensorOp_8816_TN(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 16], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [16,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
DataType.s32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
op.C.alignment = 16
else:
op.C.alignment = 8
#
#
def GenerateSM75_TensorOp_8816_Interleaved(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32),
]
math_instructions = [
MathInstruction( \
[8, 8, 16], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [16,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
DataType.s8,
DataType.f32,
]
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp, \
SwizzlingFunctor.Identity)
for op in operations:
op.C.alignment = 8
#
#
def GenerateSM75_TensorOp_8832_TN(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[8, 8, 32], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [32,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
DataType.s32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
operations = []
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
for op in operations:
if op.tile_description.threadblock_shape[1] >= 128:
op.C.alignment = 8
elif op.tile_description.threadblock_shape[1] == 64:
op.C.alignment = 8
else:
op.C.alignment = 4
#
#
def GenerateSM75_TensorOp_8832_Interleaved(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
return
layouts = [
(LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64),
]
math_instructions = [
MathInstruction( \
[8, 8, 32], \
DataType.s4, DataType.s4, DataType.s32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_saturate),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [32,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
DataType.s4,
DataType.f32,
]
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp, \
SwizzlingFunctor.Identity)
for op in operations:
op.C.alignment = 16
#
#
def GenerateSM75_WmmaTensorOp_161616(manifest, args):
if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 0):
return
layouts = [
(LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
]
math_instructions = [
MathInstruction( \
[16, 16, 16], \
DataType.s8, DataType.s8, DataType.s32, \
OpcodeClass.WmmaTensorOp, \
MathOperation.multiply_add),
]
min_cc = 75
max_cc = 1024
alignment_constraints = [16,]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
DataType.f32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
DataType.f32,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints)
#
#
def GenerateSM75(manifest, args):
GenerateSM75_TensorOp_1688(manifest, args)
GenerateSM75_PlanarComplexTensorOp_1688(manifest, args)
GenerateSM75_TensorOp_8816_TN(manifest, args)
GenerateSM75_TensorOp_8816_Interleaved(manifest, args)
GenerateSM75_TensorOp_8832_TN(manifest, args)
GenerateSM75_TensorOp_8832_Interleaved(manifest, args)
#GenerateSM75_WmmaTensorOp_161616(manifest, args)
###################################################################################################
###################################################################################################
###################################################################################################
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")
parser.add_argument("--operations", default="all", help="Specifies the operation to generate (gemm, all)")
parser.add_argument("--build-dir", default=".", required=False, help="CUTLASS top-level build directory")
parser.add_argument("--curr-build-dir", default=".", help="CUTLASS current build directory. cmake files will be emitted in this directory")
parser.add_argument("--generator-target", default='library', help="Target of CUTLASS Library Generator.")
parser.add_argument("--architectures", default='50;60;61;75', help="Target compute architectures")
parser.add_argument("--kernels", default='', help='Comma delimited list to filter kernels by name.')
parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit")
args = parser.parse_args()
manifest = Manifest(args)
GenerateSM50(manifest, args)
GenerateSM60(manifest, args)
GenerateSM61(manifest, args)
GenerateSM70(manifest, args)
GenerateSM75(manifest, args)
if 'library' in args.generator_target.split(','):
manifest.emit(GeneratorTarget.Library)
#
###################################################################################################