cutlass/tools/library/scripts/generator.py
Andrew Kerr fb335f6a5f
CUTLASS 2.0 (#62)
CUTLASS 2.0

Substantially refactored for

- Better performance, particularly for native Turing Tensor Cores
- Robust and durable templates spanning the design space
- Encapsulated functionality embodying modern C++11 programming techniques
- Optimized containers and data types for efficient, generic, portable device code

Updates to:
- Quick start guide
- Documentation
- Utilities
- CUTLASS Profiler

Native Turing Tensor Cores
- Efficient GEMM kernels targeting Turing Tensor Cores
- Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands

Coverage of existing CUTLASS functionality:
- GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs
- Volta Tensor Cores through native mma.sync and through WMMA API
- Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions
- Batched GEMM operations
- Complex-valued GEMMs

Note: this commit and all that follow require a host compiler supporting C++11 or greater.
2019-11-19 16:55:34 -08:00

236 lines
9.1 KiB
Python

#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#
import enum
import os.path
import shutil
import argparse
from library import *
from manifest import *
from gemm_operation import *
###################################################################################################
#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
    """Return True when a CUDA Toolkit version is at least major.minor.patch.

    Args:
        semantic_ver_string: Dot-separated numeric version such as "10.2.0".
            An empty string is treated as version 10.2.0. Strings with fewer
            than three components ("10.2", "11") are padded with zeros.
        major: Required major version.
        minor: Required minor version.
        patch: Required patch version (defaults to 0).

    Returns:
        True if the parsed version compares greater than or equal to
        [major, minor, patch], otherwise False.

    Raises:
        ValueError: If a component of the version string is not an integer.
    """
    if semantic_ver_string == '':
        cuda_version = [10, 2, 0]
    else:
        cuda_version = [int(x) for x in semantic_ver_string.split('.')]
        # Lists compare lexicographically and a strict prefix sorts first, so
        # without padding "10.2" ([10, 2]) would be considered older than
        # [10, 2, 0]. Fill in the missing components with zeros.
        cuda_version += [0] * (3 - len(cuda_version))
    return cuda_version >= [major, minor, patch]
###################################################################################################
#
def GenerateSM50(manifest, args):
    """Register the SIMT f32 and f64 GEMM tile configurations with min_cc 50.

    For each precision a [1, 1, 1] SIMT math instruction is created, a list
    of tile descriptions is built from it, and that list is handed to
    GenerateGemmSimt for both the plain and the batched GEMM kinds.

    Args:
        manifest: Forwarded unchanged to GenerateGemmSimt.
        args: Unused here; kept so every generator shares one signature.
    """
    min_cc = 50
    max_cc = 1024
    stages = 2

    def simt_tiles(element, shapes):
        # One math instruction per precision, shared by all of its tiles.
        math_inst = MathInstruction(
            [1, 1, 1], element, element, element, OpcodeClass.Simt)
        return [
            TileDescription(block, stages, warps, math_inst, min_cc, max_cc)
            for block, warps in shapes
        ]

    def emit(tiles):
        for gemm_kind in (GemmKind.Gemm, GemmKind.Batched):
            GenerateGemmSimt(gemm_kind, manifest, tiles, min_cc)

    # single-precision
    emit(simt_tiles(DataType.f32, [
        ([128, 128, 8], [2, 2, 1]),
        ([128, 256, 8], [2, 4, 1]),
        ([256, 128, 8], [4, 2, 1]),
        ([64, 128, 8], [2, 2, 1]),
        ([128, 64, 8], [2, 2, 1]),
        ([128, 32, 8], [4, 1, 1]),
        ([32, 128, 8], [1, 4, 1]),
    ]))

    # double precision
    emit(simt_tiles(DataType.f64, [
        ([128, 128, 8], [4, 2, 1]),
        ([64, 128, 8], [2, 2, 1]),
        ([128, 64, 8], [2, 2, 1]),
        ([128, 32, 8], [4, 1, 1]),
        ([32, 128, 8], [1, 4, 1]),
    ]))
###################################################################################################
#
def GenerateSM60(manifest, args):
    """Register the SIMT f16 GEMM tile configurations with min_cc 60.

    Args:
        manifest: Forwarded unchanged to GenerateGemmSimt.
        args: Unused here; kept so every generator shares one signature.
    """
    min_cc = 60
    max_cc = 1024
    stages = 2

    half_inst = MathInstruction(
        [1, 1, 1], DataType.f16, DataType.f16, DataType.f16, OpcodeClass.Simt)

    # (threadblock shape, warp count) pairs, in registration order.
    shapes = [
        ([256, 256, 8], [4, 2, 1]),
        ([128, 256, 8], [2, 2, 1]),
        ([128, 128, 8], [2, 2, 1]),
        ([64, 128, 8], [2, 2, 1]),
        ([32, 128, 8], [1, 2, 1]),
        ([128, 32, 8], [2, 1, 1]),
    ]

    tile_descriptions = [
        TileDescription(block, stages, warps, half_inst, min_cc, max_cc)
        for block, warps in shapes
    ]

    GenerateGemmSimt(GemmKind.Gemm, manifest, tile_descriptions, min_cc)
###################################################################################################
#
def GenerateSM61(manifest, args):
    """Register the SIMT s8 -> s32 GEMM tile configurations with min_cc 61.

    The math instruction has shape [1, 1, 4] with s8 operands and an s32
    accumulator; every tile uses a threadblock K extent of 32.

    Args:
        manifest: Forwarded unchanged to GenerateGemmSimt.
        args: Unused here; kept so every generator shares one signature.
    """
    min_cc = 61
    max_cc = 1024
    stages = 2

    int8_inst = MathInstruction(
        [1, 1, 4], DataType.s8, DataType.s8, DataType.s32, OpcodeClass.Simt)

    # (threadblock shape, warp count) pairs, in registration order.
    shapes = [
        ([128, 256, 32], [2, 4, 1]),
        ([256, 128, 32], [4, 2, 1]),
        ([128, 128, 32], [2, 4, 1]),
        ([64, 128, 32], [2, 2, 1]),
        ([128, 64, 32], [4, 1, 1]),
        ([32, 128, 32], [1, 2, 1]),
        ([128, 32, 32], [2, 1, 1]),
    ]

    tile_descriptions = [
        TileDescription(block, stages, warps, int8_inst, min_cc, max_cc)
        for block, warps in shapes
    ]

    GenerateGemmSimt(GemmKind.Gemm, manifest, tile_descriptions, min_cc)
###################################################################################################
#
def GenerateSM70(manifest, args):
    """Register the TensorOp and WmmaTensorOp f16 GEMM tiles for min_cc 70.

    Two groups are produced, both with max_cc 75:
      * TensorOp tiles from [8, 8, 4] instructions, registered for the plain
        and batched GEMM kinds only when the CUDA Toolkit version in
        args.cuda_version is at least 10.1.
      * WmmaTensorOp tiles from [16, 16, 16] instructions, registered for the
        plain GEMM kind unconditionally.

    In both groups the threadblock K extent is k_groups times the K extent
    of the math instruction.

    Args:
        manifest: Forwarded unchanged to the GenerateGemm* functions.
        args: Parsed command-line arguments; only cuda_version is read.
    """
    min_cc = 70
    max_cc = 75

    def build_tiles(instructions, k_groups, stages, shapes):
        # Expand every instruction into one tile description per shape.
        tiles = []
        for math_inst in instructions:
            kblock = k_groups * math_inst.instruction_shape[2]
            for (m, n), warps in shapes:
                tiles.append(TileDescription(
                    [m, n, kblock], stages, list(warps),
                    math_inst, min_cc, max_cc))
        return tiles

    mma_instructions = [
        MathInstruction([8, 8, 4], DataType.f16, DataType.f16, DataType.f16, OpcodeClass.TensorOp),
        MathInstruction([8, 8, 4], DataType.f16, DataType.f16, DataType.f32, OpcodeClass.TensorOp),
    ]
    mma_tiles = build_tiles(mma_instructions, 8, 2, (
        ((256, 128), (4, 2, 1)),
        ((128, 256), (2, 4, 1)),
        ((128, 128), (2, 2, 1)),
        ((64, 128), (2, 2, 1)),
        ((128, 64), (2, 2, 1)),
        ((64, 64), (2, 2, 1)),
    ))

    if CudaToolkitVersionSatisfies(args.cuda_version, 10, 1):
        for gemm_kind in (GemmKind.Gemm, GemmKind.Batched):
            GenerateGemmTensorOp(gemm_kind, manifest, mma_tiles, min_cc)

    # wmma tensor op SM70 Gemm kernels
    wmma_instructions = [
        MathInstruction([16, 16, 16], DataType.f16, DataType.f16, DataType.f16, OpcodeClass.WmmaTensorOp),
        MathInstruction([16, 16, 16], DataType.f16, DataType.f16, DataType.f32, OpcodeClass.WmmaTensorOp),
    ]
    wmma_tiles = build_tiles(wmma_instructions, 2, 2, (
        ((128, 128), (2, 4, 1)),
        ((64, 128), (2, 2, 1)),
        ((128, 64), (2, 2, 1)),
        ((64, 64), (2, 2, 1)),
    ))

    GenerateGemmWmmaTensorOp(GemmKind.Gemm, manifest, wmma_tiles, min_cc)
###################################################################################################
#
def GenerateSM75(manifest, args):
    """Register the TensorOp GEMM tile configurations with min_cc 75.

    Tiles are built for four math instructions (f16 with f16 or f32
    accumulation, s8 -> s32 and s4 -> s32). The threadblock K extent is
    k_groups (4) times the K extent of the instruction. Nothing is
    registered unless the CUDA Toolkit version in args.cuda_version is at
    least 10.2.

    Args:
        manifest: Forwarded unchanged to GenerateGemmTensorOp.
        args: Parsed command-line arguments; only cuda_version is read.
    """
    min_cc = 75
    max_cc = 1024
    stages = 2
    k_groups = 4

    instructions = [
        MathInstruction([16, 8, 8], DataType.f16, DataType.f16, DataType.f16, OpcodeClass.TensorOp),
        MathInstruction([16, 8, 8], DataType.f16, DataType.f16, DataType.f32, OpcodeClass.TensorOp),
        MathInstruction([8, 8, 16], DataType.s8, DataType.s8, DataType.s32, OpcodeClass.TensorOp),
        MathInstruction([8, 8, 32], DataType.s4, DataType.s4, DataType.s32, OpcodeClass.TensorOp),
    ]

    # ((M, N) of the threadblock, warp count) pairs, in registration order.
    shapes = (
        ((256, 128), (4, 2, 1)),
        ((128, 256), (2, 4, 1)),
        ((128, 128), (2, 2, 1)),
        ((64, 128), (2, 2, 1)),
        ((128, 64), (2, 2, 1)),
        ((64, 64), (2, 2, 1)),
    )

    tile_descriptions = []
    for math_inst in instructions:
        kblock = k_groups * math_inst.instruction_shape[2]
        for (m, n), warps in shapes:
            tile_descriptions.append(TileDescription(
                [m, n, kblock], stages, list(warps),
                math_inst, min_cc, max_cc))

    if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
        return

    for gemm_kind in (GemmKind.Gemm, GemmKind.Batched):
        GenerateGemmTensorOp(gemm_kind, manifest, tile_descriptions, min_cc)
###################################################################################################
###################################################################################################
if __name__ == "__main__":
    # Script entry point: parse the command line, run every per-architecture
    # generator against a single manifest, then emit the library target if
    # it was requested.
    parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")

    # (flag, keyword arguments) pairs, registered in this order.
    cli_options = [
        ("--operations", dict(default="gemm", help="Specifies the operation to generate (gemm, all)")),
        ("--build-dir", dict(default=".", required=False, help="CUTLASS top-level build directory")),
        ("--curr-build-dir", dict(default=".", help="CUTLASS current build directory. cmake files will be emitted in this directory")),
        ("--generator-target", dict(default='library', help="Target of CUTLASS Library Generator.")),
        ("--architectures", dict(default='50 60 61 75', help="Target compute architectures")),
        ("--kernels", dict(default='', help='Comma delimited list to filter kernels by name.')),
        ("--cuda-version", dict(default="10.2.0", help="Semantic version string of CUDA Toolkit")),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    manifest = Manifest(args)

    # Every generator runs regardless of --architectures; any filtering by
    # architecture is not done in this block.
    for generate in (GenerateSM50, GenerateSM60, GenerateSM61, GenerateSM70, GenerateSM75):
        generate(manifest, args)

    # --generator-target is a comma-separated list of targets.
    requested_targets = args.generator_target.split(',')
    if 'library' in requested_targets:
        manifest.emit(GeneratorTarget.Library)
#
###################################################################################################