cutlass/test/unit/gemm/device/CMakeLists.txt

# Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright notice, this list of
#       conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
#       to endorse or promote products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Registers the device-level GEMM unit-test target. The helper
# cutlass_test_unit_add_executable() is defined outside this file; this call
# passes it the target name, two batching options, and the list of CUDA (.cu)
# test sources below.
cutlass_test_unit_add_executable(
  cutlass_test_unit_gemm_device

  # NOTE(review): presumably these options tell the helper to combine the
  # sources into groups of 4, in which case the order of the list below would
  # influence the grouping - confirm against the helper's definition before
  # reordering entries.
  BATCH_SOURCES ON
  BATCH_SIZE 4

  # tensor op f16 tests (sm75), including the slicedk variants
  gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
  gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
  gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
  gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu

  # tensor op f32 tests (sm75)
  gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu

  gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
  gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
  gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
  gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu

  gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
  gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu

  # tensor op int8 tests (sm75)
  gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
  gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
  gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
  gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
  gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu

  # tensor op s4 tests (sm75)
  gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
  gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
  gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu

  # volta tensor op tests (sm70)
  gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
  gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
  gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
  gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu

  gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu

  gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
  gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu

  # simt tests (sm50 unless the file name says otherwise); the sm50 groups
  # have one file each for the nn, nt, tn and tt variants
  simt_cgemm_nn_sm50.cu
  simt_cgemm_nt_sm50.cu
  simt_cgemm_tn_sm50.cu
  simt_cgemm_tt_sm50.cu

  simt_dgemm_nn_sm50.cu
  simt_dgemm_nt_sm50.cu
  simt_dgemm_tn_sm50.cu
  simt_dgemm_tt_sm50.cu

  simt_hgemm_nn_sm50.cu
  simt_hgemm_nt_sm50.cu
  simt_hgemm_tn_sm50.cu
  simt_hgemm_tt_sm50.cu

  simt_igemm_nn_sm50.cu
  simt_igemm_nt_sm50.cu
  simt_igemm_tn_sm50.cu
  simt_igemm_tt_sm50.cu

  # simt int8 igemm tests (sm61)
  simt_int8_igemm_sm61_sliced_k.cu
  simt_int8_igemm_sm61.cu

  simt_sgemm_nn_sm50.cu
  simt_sgemm_nt_sm50.cu
  simt_sgemm_tn_sm50.cu
  simt_sgemm_tt_sm50.cu

  simt_zgemm_nn_sm50.cu
  simt_zgemm_nt_sm50.cu
  simt_zgemm_tn_sm50.cu
  simt_zgemm_tt_sm50.cu

  # splitk tests (tensor op and simt)
  gemm_splitk_serial_tensor_op_sm75.cu
  gemm_splitk_tensor_op_sm75.cu
  gemm_splitk_tensor_op_sm70.cu
  gemm_splitk_simt_sm50.cu

  # wmma floating point tests
  gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
  gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
  gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
  gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
  gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
  gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
  gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
  gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu

  gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
  gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
  gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
  gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu

  gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
  gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
  gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
  gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
  gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu

  # wmma int8 tests
  gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
  gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu

  gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
  gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu

  # wmma uint8 tests
  gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu

  # wmma sub-byte (s4 and b1) tests
  gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
  gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu

  gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
  gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu

  # wmma floating point tests (using single-stage pipeline)
  gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
  gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu

  gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
)
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without modification, are permitted`
			`# provided that the following conditions are met:`
			`# * Redistributions of source code must retain the above copyright notice, this list of`
			`# conditions and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice, this list of`
			`# conditions and the following disclaimer in the documentation and/or other materials`
			`# provided with the distribution.`
			`# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used`
			`# to endorse or promote products derived from this software without specific prior written`
			`# permission.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR`
			`# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND`
			`# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE`
			`# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,`
			`# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;`
			`# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,`
			`# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`

			`cutlass_test_unit_add_executable(`
			`cutlass_test_unit_gemm_device`

CUTLASS 2.1 (#83) CUTLASS 2.1 contributes: - BLAS-style host-side API added to CUTLASS Library - Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores - Minor enhancements and bug fixes 2020-04-08 04:51:25 +08:00			`BATCH_SOURCES ON`
			`BATCH_SIZE 4`

CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu`
			`gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu`
CUTLASS 2.1 (#83) CUTLASS 2.1 contributes: - BLAS-style host-side API added to CUTLASS Library - Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores - Minor enhancements and bug fixes 2020-04-08 04:51:25 +08:00			`gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu`
			`gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00
			`gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu`

			`gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu`
			`gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu`
			`gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu`
			`gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu`

			`gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu`
			`gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu`

			`gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu`
			`gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu`
			`gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu`
			`gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu`
			`gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu`

			`gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu`
			`gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu`
			`gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu`

			`gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu`

			`gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu`

			`gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu`
			`gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu`

			`simt_cgemm_nn_sm50.cu`
			`simt_cgemm_nt_sm50.cu`
			`simt_cgemm_tn_sm50.cu`
			`simt_cgemm_tt_sm50.cu`

			`simt_dgemm_nn_sm50.cu`
			`simt_dgemm_nt_sm50.cu`
			`simt_dgemm_tn_sm50.cu`
			`simt_dgemm_tt_sm50.cu`

			`simt_hgemm_nn_sm50.cu`
			`simt_hgemm_nt_sm50.cu`
			`simt_hgemm_tn_sm50.cu`
			`simt_hgemm_tt_sm50.cu`

			`simt_igemm_nn_sm50.cu`
			`simt_igemm_nt_sm50.cu`
			`simt_igemm_tn_sm50.cu`
			`simt_igemm_tt_sm50.cu`

			`simt_int8_igemm_sm61_sliced_k.cu`
			`simt_int8_igemm_sm61.cu`

			`simt_sgemm_nn_sm50.cu`
			`simt_sgemm_nt_sm50.cu`
			`simt_sgemm_tn_sm50.cu`
			`simt_sgemm_tt_sm50.cu`

			`simt_zgemm_nn_sm50.cu`
			`simt_zgemm_nt_sm50.cu`
			`simt_zgemm_tn_sm50.cu`
			`simt_zgemm_tt_sm50.cu`

CUTLASS 2.1 (#83) CUTLASS 2.1 contributes: - BLAS-style host-side API added to CUTLASS Library - Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores - Minor enhancements and bug fixes 2020-04-08 04:51:25 +08:00			`gemm_splitk_serial_tensor_op_sm75.cu`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`gemm_splitk_tensor_op_sm75.cu`
			`gemm_splitk_tensor_op_sm70.cu`
			`gemm_splitk_simt_sm50.cu`

			`# wmma floating point tests`
			`gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu`

			`gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu`

			`gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu`
			`gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu`

			`# wmma int8 tests`
			`gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu`
			`gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu`

			`gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu`
			`gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu`

			`# wmma uint8 tests`
			`gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu`

			`# wmma sub-byte (s4 and b1) tests`
			`gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu`
			`gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu`

			`gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu`
			`gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu`

			`# wmma floating point tests (using single-stage pipeline)`
			`gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu`
			`gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu`

			`gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu`
			`)`