
CUTLASS 1.3 Release - Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1.
303 lines
11 KiB
CMake
303 lines
11 KiB
CMake
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without modification, are permitted
|
|
# provided that the following conditions are met:
|
|
# * Redistributions of source code must retain the above copyright notice, this list of
|
|
# conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright notice, this list of
|
|
# conditions and the following disclaimer in the documentation and/or other materials
|
|
# provided with the distribution.
|
|
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
|
# to endorse or promote products derived from this software without specific prior written
|
|
# permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
|
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
|
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
# string(APPEND ...) is used throughout this file and only exists since CMake 3.4,
# so 3.4 is the oldest release that can actually configure this project.
cmake_minimum_required(VERSION 3.4.0 FATAL_ERROR)
|
|
|
|
# Languages handed to project(). CUDA is appended only when CMake's native CUDA
# language support is selected below.
set(CUTLASS_LANGUAGES CXX)

# cutlass_add_executable(<name> <sources>...)
#   Forwards all of its arguments to the command that builds an executable for
#   the selected CUDA integration: add_executable() for native CUDA support,
#   cuda_add_executable() for the FindCUDA fallback.
if (WIN32 AND NOT CMAKE_VERSION VERSION_LESS "3.9.0")
  # CMake 3.9.0 has native support for CUDA without the need of the CUDA package.
  # This path is only taken on Windows.
  set(CUTLASS_NATIVE_CUDA TRUE)
  list(APPEND CUTLASS_LANGUAGES CUDA)

  macro(cutlass_add_executable)
    add_executable(${ARGN})
  endmacro()
else()
  # FindCUDA fails to detect VS 2017 due to a changed directory format of the toolkits.
  # For this configuration we need CMake >= 3.9.0 to use the native CUDA support.
  # NOTE(review): MSVC_VERSION is read here before project() is called - confirm
  # that it is already populated at this point.
  if (WIN32 AND MSVC_VERSION GREATER 1800)
    message(SEND_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
    cmake_minimum_required(VERSION 3.9.0 FATAL_ERROR)
  endif()

  # Fall back to the FindCUDA version to create an executable with CUDA files
  macro(cutlass_add_executable)
    cuda_add_executable(${ARGN})
  endmacro()
endif()
|
|
|
|
project(CUTLASS ${CUTLASS_LANGUAGES})

# Check if the configuration is supported: only 64-bit toolchains are accepted.
if (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
  message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
endif()

# The CUDA package is required regardless of which executable wrapper was chosen
# above; the rest of this file reads CUDA_VERSION, CUDA_TOOLKIT_ROOT_DIR and
# CUDA_INCLUDE_DIRS.
find_package(CUDA REQUIRED)

# Some platforms (e.g. Visual Studio) don't add the CUDA include directories to the
# system include paths by default, so we add them explicitly here.
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})

# Doxygen is optional; its absence is not reported (QUIET).
find_package(Doxygen QUIET)
|
|
|
|
###################################################################################################
|
|
#
|
|
# Configure CMake variables
|
|
#
|
|
###################################################################################################
|
|
|
|
#
|
|
# Conditionally enable cuBLAS
|
|
#
|
|
set(CUTLASS_ENABLE_CUBLAS ON CACHE BOOL "Enable CUTLASS Tests to build with cuBLAS library.")

# Look for cuBLAS only when it was requested. The toolkit's lib64 and lib/x64
# directories are offered as hints and the result is stored in CUBLAS_LIBRARY.
# NOTE(review): nothing here verifies that the library was actually found -
# presumably the consumers of CUBLAS_LIBRARY deal with that; confirm.
if (CUTLASS_ENABLE_CUBLAS)
  find_library(
    CUBLAS_LIBRARY cublas
    HINTS
      ${CUDA_TOOLKIT_ROOT_DIR}/lib64
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
endif()
|
|
|
|
|
|
# By default we want to build in Release mode to ensure that we're getting best performance.
# The default is applied only when the user has not picked a build type and the
# generator is single-configuration: multi-configuration generators (Visual Studio,
# Xcode) populate CMAKE_CONFIGURATION_TYPES and do not use CMAKE_BUILD_TYPE.
if (NOT (CMAKE_BUILD_TYPE OR CMAKE_CONFIGURATION_TYPES))
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose build level" FORCE)
  # Offer the supported build types (Debug, RelWithDebInfo, Release) as a
  # selection list in cmake-gui / ccmake.
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "RelWithDebInfo" "Release")
endif()
|
|
|
|
# Windows-specific settings: gtest runtime selection and host-compiler options
# forwarded through NVCC.
if (WIN32)
  # On Windows we link against the shared (DLL) runtime. Change gtest settings to match this.
  set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib even when Google Test is built as static lib" FORCE)

  # Enable more warnings and treat as errors
  string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")

  # Disable warning on Unicode characters
  string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")

  # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
  string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")

  # Verbose option. The variable is tested by name so that if() evaluates its
  # value exactly once; expanding it first would make if() re-interpret the
  # value as another variable name.
  # NOTE(review): because of the nesting, -v is only honoured on Windows -
  # confirm that this is intended.
  if (CUTLASS_NVCC_VERBOSE)
    string(APPEND NVCC_FLAGS " -v")
  endif()
endif()
|
|
|
|
# Build the default list of SM architectures. Each group is appended only when
# CUDA_VERSION is at least the version named in its guard, so the list stays in
# ascending order.
set(CUTLASS_NVCC_ARCHS_DEFAULT "")

if (NOT CUDA_VERSION VERSION_LESS 7.5)
  list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 50)
endif()

if (NOT CUDA_VERSION VERSION_LESS 8.0)
  list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 60 61)
endif()

if (NOT CUDA_VERSION VERSION_LESS 9.0)
  list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 70)
endif()

if (NOT CUDA_VERSION VERSION_LESS 9.2)
  list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 72)
endif()

if (NOT CUDA_VERSION VERSION_LESS 10.0)
  list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 75)
endif()

# User-overridable cache entry; the computed list above is only its default.
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_DEFAULT} CACHE STRING "The SM architectures to build code for.")
|
|
|
|
# User-facing switches controlling what NVCC embeds and keeps.
set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")

# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
# The feature therefore defaults to ON unless the toolkit is older than 10.1.
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
if (CUDA_VERSION VERSION_LESS 10.1)
  set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT OFF)
endif()

set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
  "Enable PTX mma instruction for collective matrix multiply operations.")

# The default is whatever value the variable already carries when this line runs
# (empty, i.e. false, when it has not been set).
set(CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST ${CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST} CACHE BOOL
  "Enable more kernels instantiated in the perf suite. This might result in longer compiler time. ")
|
|
|
|
#
|
|
# NOTE: running with asan and CUDA requires the following environment variable:
|
|
#
|
|
# ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0
|
|
#
|
|
# without the above environment setting, an error like the following may be generated:
|
|
#
|
|
# *** Error: Could not detect active GPU device ID [out of memory]
|
|
# ...
|
|
# ==9149==ERROR: LeakSanitizer: detected memory leaks
|
|
# ...
|
|
#
|
|
# AddressSanitizer: https://github.com/google/sanitizers/wiki/AddressSanitizer
# When ENABLE_ASAN is set, the sanitizer is added to the executable link flags
# and the instrumentation options are forwarded through NVCC's --compiler-options.
if (ENABLE_ASAN)
  string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address")
  string(APPEND NVCC_FLAGS " --compiler-options -fsanitize=address --compiler-options -fno-omit-frame-pointer")
endif()
|
|
|
|
###################################################################################################
|
|
#
|
|
# Configure CUDA build options
|
|
#
|
|
###################################################################################################
|
|
|
|
# Set NVCC arguments
#
# For every requested architecture one -gencode clause is appended per enabled
# embedding mode: code=sm_<arch> when CUTLASS_NVCC_EMBED_CUBIN is on and
# code=compute_<arch> when CUTLASS_NVCC_EMBED_PTX is on.
foreach (SM_ARCH ${CUTLASS_NVCC_ARCHS})
  if (CUTLASS_NVCC_EMBED_CUBIN)
    string(APPEND NVCC_FLAGS " -gencode arch=compute_${SM_ARCH},code=sm_${SM_ARCH}")
  endif()
  if (CUTLASS_NVCC_EMBED_PTX)
    string(APPEND NVCC_FLAGS " -gencode arch=compute_${SM_ARCH},code=compute_${SM_ARCH}")
  endif()
endforeach()

# Expose the optional features to the compiled sources as preprocessor definitions.
if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
  string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1")
endif()

if (CUTLASS_ENABLE_CUBLAS)
  string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_CUBLAS=1")
endif()

# Unlike the definitions above, this one is added at directory scope rather than
# to the NVCC flag string.
if (CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST)
  add_definitions(-DEXHAUSTIVE_PROF)
endif()

# Keep the intermediate files NVCC generates when requested.
if (CUTLASS_NVCC_KEEP)
  string(APPEND NVCC_FLAGS " -keep")
endif()
|
|
|
|
# -lineinfo goes into the Release-only flags for native CUDA builds on Windows
# and into the flags common to every configuration otherwise.
if (WIN32 AND CUTLASS_NATIVE_CUDA)
  string(APPEND NVCC_FLAGS_RELEASE " -lineinfo")
else()
  string(APPEND NVCC_FLAGS " -lineinfo")
endif()

# Forward -Wconversion to the host compiler on UNIX platforms.
if (UNIX)
  string(APPEND NVCC_FLAGS " -Xcompiler -Wconversion")
endif()

# Per-configuration flags.
string(APPEND NVCC_FLAGS_DEBUG " -g")
string(APPEND NVCC_FLAGS_RELWITHDEBINFO " -O3")
# Release additionally defines NDEBUG to disable assertions.
string(APPEND NVCC_FLAGS_RELEASE " -O3" " -DNDEBUG")
|
|
|
|
# Hand the accumulated flag strings to whichever CUDA integration is in use:
# the CMAKE_CUDA_FLAGS* variables for native CUDA language support, the
# CUDA_NVCC_FLAGS* variables for the FindCUDA fallback.
if (CUTLASS_NATIVE_CUDA)
  set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS}")
  set(CMAKE_CUDA_FLAGS_DEBUG "${NVCC_FLAGS_DEBUG}")
  set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${NVCC_FLAGS_RELWITHDEBINFO}")
  set(CMAKE_CUDA_FLAGS_RELEASE "${NVCC_FLAGS_RELEASE}")
else()
  set(CUDA_NVCC_FLAGS ${NVCC_FLAGS})
  set(CUDA_NVCC_FLAGS_DEBUG ${NVCC_FLAGS_DEBUG})
  set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${NVCC_FLAGS_RELWITHDEBINFO})
  set(CUDA_NVCC_FLAGS_RELEASE ${NVCC_FLAGS_RELEASE})
endif()
|
|
|
|
#
|
|
# The following items should eventually be pushed into cutlass/CMakeLists.txt
|
|
#
|
|
|
|
# GLOB for CUTLASS header files. Should we use a static list instead?
# Each variable receives the headers of one directory, expressed relative to
# this source directory. The globs are not recursive.
file(GLOB CUTLASS_CORE          RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
file(GLOB CUTLASS_GEMM          RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/gemm/*.h)
file(GLOB CUTLASS_UTIL          RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h)
file(GLOB CUTLASS_DEVICE        RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h)
file(GLOB CUTLASS_REDUCTION     RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/reduction/*.h)
file(GLOB CUTLASS_LAYOUT_THREAD RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/layout/thread/*.h)
|
|
|
|
###################################################################################################
|
|
#
|
|
# Define build targets
|
|
#
|
|
###################################################################################################
|
|
|
|
# Arrange the headers in IDE folders that mirror the directory layout.
source_group("cutlass" FILES ${CUTLASS_CORE})
source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM})
source_group("cutlass\\util" FILES ${CUTLASS_UTIL})
source_group("cutlass\\device" FILES ${CUTLASS_DEVICE})
source_group("cutlass\\reduction" FILES ${CUTLASS_REDUCTION})
source_group("cutlass\\layout\\thread" FILES ${CUTLASS_LAYOUT_THREAD})
|
|
|
|
# CUTLASS is modelled as an INTERFACE library: it has no build step of its own,
# it only carries header sources and an include path for its consumers.
add_library(CUTLASS INTERFACE)

# Make the source root an include path for this directory and its
# subdirectories, and for every target that links against CUTLASS.
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
target_include_directories(CUTLASS INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

# Special policy introduced in CMake 3.13: with CMP0076 set to NEW,
# target_sources() converts relative paths to absolute ones. It is set only when
# the running CMake knows the policy.
if (POLICY CMP0076)
  cmake_policy(SET CMP0076 NEW)
endif()

target_sources(
  CUTLASS INTERFACE
    ${CUTLASS_GEMM}
    ${CUTLASS_UTIL}
    ${CUTLASS_DEVICE}
    ${CUTLASS_CORE}
    ${CUTLASS_REDUCTION}
    ${CUTLASS_LAYOUT_THREAD})
|
|
|
|
# Create a custom target to ensure that the CUTLASS sources are visible in an IDE.
# The target runs no command; it only lists the header files as its sources.
add_custom_target(
  cutlass_ide
  SOURCES
    ${CUTLASS_GEMM}
    ${CUTLASS_UTIL}
    ${CUTLASS_DEVICE}
    ${CUTLASS_CORE}
    ${CUTLASS_REDUCTION}
    ${CUTLASS_LAYOUT_THREAD})
|
|
# Generate documentation when Doxygen is available.
if (DOXYGEN_FOUND)
  # Graph generation is offered (default ON) when dot was found and is forced
  # OFF when it was not.
  if (DOXYGEN_DOT_EXECUTABLE)
    set(CUTLASS_ENABLE_DOXYGEN_DOT ON CACHE BOOL "Use dot to generate graphs in the doxygen documentation.")
  else()
    set(CUTLASS_ENABLE_DOXYGEN_DOT OFF CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE)
  endif()

  # Translate the CMake boolean into the YES/NO spelling exported to Doxygen.
  set(HAVE_DOT "NO")
  if (CUTLASS_ENABLE_DOXYGEN_DOT)
    set(HAVE_DOT "YES")
  endif()

  # Add custom target for Doxygen. DOT_PATH and HAVE_DOT are passed as
  # environment variables and Doxygen runs from the source directory against
  # the Doxyfile located there.
  add_custom_target(
    cutlass_docs
    COMMAND ${CMAKE_COMMAND} -E env
      "DOT_PATH=${DOXYGEN_DOT_EXECUTABLE}"
      "HAVE_DOT=${HAVE_DOT}"
      ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    VERBATIM)
endif()
|
|
|
|
# Descend into the sub-projects. They are added last, after every flag, macro
# and target above has been defined.
add_subdirectory(tools)
add_subdirectory(examples)
|