Updated mma_sm80.h to avoid perf penalty due to reinterpret_cast<>. (#100)

- Updated mma_sm80.h to avoid perf penalty due to reinterpret_cast<>.
- Enhancement to CUTLASS Utility Library's HostTensorPlanarComplex template to support copy-in and copy-out
- Added test_examples target to build and test all CUTLASS examples
- Minor edits to documentation to point to GTC 2020 webinar
Author: Andrew Kerr (committed via GitHub)
Date: 2020-06-15 10:47:01 -07:00
Commit: 1ab1027954 (parent: 86931fef85)
11 changed files with 213 additions and 33 deletions

View File

@@ -9,6 +9,7 @@
 * Tensor Float 32, BFloat16, and double-precision data types
 * Mixed integer data types (int8, int4, bin1)
 * Asynchronous copy for deep software pipelines via [`cp.async`](https://docs.nvidia.com/cuda/parallel-thread-execution)
+* Described in [GTC 2020 Webinar (SR 21745)](https://developer.nvidia.com/gtc/2020/video/s21745) (free registration required)
 * Features:
 * SDK examples showing GEMM fused with bias+relu and fused GEMM+GEMM
 * Complex-valued GEMMs targeting NVIDIA Ampere Tensor Cores in double-precision and Tensor Float 32

View File

@@ -37,6 +37,7 @@ CUTLASS 2.2 is a significant update to CUTLASS adding:
 - Coverage of [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
 - Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types
 - Deep software pipelines using asynchronous copy
+- Described in [GTC 2020 Webinar (SR 21745)](https://developer.nvidia.com/gtc/2020/video/s21745)
 - Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit)
 # What's New in CUTLASS 2.1

View File

@@ -20,15 +20,9 @@
 # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-cutlass_add_executable(
+cutlass_example_add_executable(
   03_visualize_layout
   visualize_layout.cpp
   register_layout.cu
   )
 
-target_link_libraries(
-  03_visualize_layout
-  PRIVATE
-  CUTLASS
-  cutlass_tools_util_includes
-  )

View File

@@ -182,10 +182,12 @@ int run() {
     return -1;
   }
 
-  if (!(props.major >= 7)) {
-    std::cerr << "Volta Tensor Ops must be run on a machine with compute capability at least 70."
+  if (props.major != 7) {
+    std::cerr << "Volta Tensor Ops must be run on a machine with compute capability of 70, 72, or 75."
               << std::endl;
-    return -1;
+
+    // Return 0 so tests pass if run on unsupported architectures or CUDA Toolkits.
+    return 0;
   }
 
   //

View File

@@ -198,10 +198,12 @@ int run() {
     return -1;
   }
 
-  if (!(props.major >= 7)) {
-    std::cerr << "Volta Tensor Ops must be run on a machine with compute capability at least 70."
+  if (props.major != 7) {
+    std::cerr << "Volta Tensor Ops must be run on a machine with compute capability of 70, 72, or 75."
               << std::endl;
-    return -1;
+
+    // Return 0 so tests are considered passing if run on unsupported architectures or CUDA Toolkits.
+    return 0;
   }
 
   const int length_m = 5120;

View File

@@ -208,7 +208,9 @@ int run() {
   if (!((props.major * 10 + props.minor) >= 75)) {
     std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75."
               << std::endl;
-    return -1;
+
+    // Return 0 so tests are considered passing if run on unsupported platforms.
+    return 0;
   }
 
   const int length_m = 5120;
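
The three example changes above all apply the same guard pattern. The sketch below illustrates that pattern in isolation; it is not the examples' actual code, and the `run()` entry point is a hypothetical stand-in for each example's workload:

```c++
#include <cuda_runtime.h>
#include <iostream>

int run();  // the example's real workload, assumed to be defined elsewhere

int main() {
  cudaDeviceProp props;
  cudaError_t error = cudaGetDeviceProperties(&props, 0);
  if (error != cudaSuccess) {
    std::cerr << "cudaGetDeviceProperties() failed: "
              << cudaGetErrorString(error) << std::endl;
    return -1;
  }

  // Guard on compute capability, e.g. 75 for the Turing Tensor Core example.
  if (props.major * 10 + props.minor < 75) {
    std::cerr << "This example requires compute capability 75 or higher." << std::endl;

    // Return 0 rather than -1 so the new test_examples target treats an
    // unsupported GPU or CUDA Toolkit as a pass instead of a failure.
    return 0;
  }

  return run();
}
```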

View File

@@ -44,9 +44,18 @@ function(cutlass_example_add_executable NAME)
     ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR}
   )
 
+  add_custom_target(
+    test_${NAME}
+    COMMAND
+    ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${NAME}>
+    DEPENDS
+    ${NAME}
+    )
+
 endfunction()
 
 add_custom_target(cutlass_examples)
+add_custom_target(test_examples)
 
 foreach(EXAMPLE
   00_basic_gemm
@@ -66,5 +75,6 @@ foreach(EXAMPLE
 
   add_subdirectory(${EXAMPLE})
   add_dependencies(cutlass_examples ${EXAMPLE})
+  add_dependencies(test_examples test_${EXAMPLE})
 
 endforeach()

View File

@@ -164,4 +164,5 @@ struct Mma<gemm::GemmShape<1, 1, 1>, 1, ElementA, LayoutA, ElementB, LayoutB, El
 #include "cutlass/arch/mma_sm61.h"
 #include "cutlass/arch/mma_sm70.h"
 #include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
 /////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -98,17 +98,17 @@ struct Mma<
 
     uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
     uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
-    uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
 
     asm(
       "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
       "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
       :
         "r"(A[0]), "r"(A[1]),
         "r"(B[0]),
-        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
+        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
     );
 
 #else
@@ -341,15 +341,15 @@ struct Mma<
 
     uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
     uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
-    uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
 
     asm volatile(
       "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
       "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
       : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
 
 #else
 
     assert(0);
@@ -402,15 +402,15 @@ struct Mma<
 
     uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
     uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
-    uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
 
     asm volatile(
       "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, "
       "{%10,%11,%12,%13};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
       : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
 
 #else
 
     assert(0);
@@ -461,15 +461,15 @@ struct Mma<
 
 #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
 
-    uint64_t const & A = reinterpret_cast<uint64_t const &>(a);
-    uint64_t const & B = reinterpret_cast<uint64_t const &>(b);
-    uint64_t const *C = reinterpret_cast<uint64_t const *>(&c);
-    uint64_t *D = reinterpret_cast<uint64_t *>(&d);
+    double const & A = reinterpret_cast<double const &>(a);
+    double const & B = reinterpret_cast<double const &>(b);
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
 
     asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=l"(D[0]), "=l"(D[1])
-      : "l"(A), "l"(B), "l"(C[0]), "l"(C[1]));
+      : "=d"(D[0]), "=d"(D[1])
+      : "d"(A), "d"(B), "d"(C[0]), "d"(C[1]));
 
 #else
 
     assert(0);
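
For context, the change in these hunks swaps the inline-PTX register constraints for the floating-point accumulators: instead of reinterpret_cast-ing them to `uint32_t`/`uint64_t` and binding with `"r"`/`"l"`, they are now bound directly with `"f"`/`"d"`, so the values stay in floating-point registers and the compiler does not have to shuffle them through integer registers around the `mma` instruction. The standalone sketch below (illustrative only, not CUTLASS code) shows the constraint usage:

```c++
// Binding floating-point operands with their native inline-PTX constraints.
// The older pattern would reinterpret_cast a, b, c to uint32_t/uint64_t and
// use the "r"/"l" (integer register) constraints here instead.
__device__ float fma_f32(float a, float b, float c) {
  float d;
  asm("fma.rn.f32 %0, %1, %2, %3;\n"
      : "=f"(d)                     // "f": 32-bit floating-point register
      : "f"(a), "f"(b), "f"(c));
  return d;
}

__device__ double fma_f64(double a, double b, double c) {
  double d;
  asm("fma.rn.f64 %0, %1, %2, %3;\n"
      : "=d"(d)                     // "d": 64-bit floating-point register
      : "d"(a), "d"(b), "d"(c));
  return d;
}
```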

View File

@@ -161,6 +161,7 @@ compiled as C++11 or greater.
 #include <iostream>
 #include <cutlass/cutlass.h>
 #include <cutlass/numeric_types.h>
+#include <cutlass/core_io.h>
 
 int main() {
@@ -174,10 +175,13 @@ int main() {
 ## Launching a GEMM kernel in CUDA
 
 **Example:** launch a mixed-precision GEMM targeting Turing Tensor Cores.
+
+_Note, this example uses CUTLASS Utilities. Be sure `tools/util/include` is listed as an include path._
+
 ```c++
 #include <cutlass/numeric_types.h>
 #include <cutlass/gemm/device/gemm.h>
 #include <cutlass/util/host_tensor.h>
 
 int main() {
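
As a rough companion to the quickstart snippet above, the sketch below shows the general shape of a device-wide GEMM launch. It uses the simplest single-precision, column-major configuration (the same pattern as the 00_basic_gemm example) rather than the mixed-precision Turing configuration the quickstart describes, and the function name `launch_sgemm` is only for illustration:

```c++
#include <cutlass/gemm/device/gemm.h>

// Default single-precision, column-major GEMM (SIMT cores).
using Gemm = cutlass::gemm::device::Gemm<
    float, cutlass::layout::ColumnMajor,   // A
    float, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor>;  // C and D

cutlass::Status launch_sgemm(
    int M, int N, int K,
    float alpha, float const *A, int lda,
    float const *B, int ldb,
    float beta, float *C, int ldc) {

  Gemm gemm_op;

  // Arguments: problem size, TensorRefs for A, B, C, and D (here D aliases C),
  // and the linear-scaling epilogue parameters (alpha, beta).
  Gemm::Arguments args({M, N, K},
                       {A, lda}, {B, ldb},
                       {C, ldc}, {C, ldc},
                       {alpha, beta});

  // Launches the kernel on the default stream and returns its status.
  return gemm_op(args);
}
```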

View File

@@ -276,6 +276,9 @@
 
   /// Gets pointer to device data with a pointer offset
   Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return device_.get() + ptr_element_offset; }
 
+  /// Gets a pointer to the device data imaginary part
+  Element * device_data_imag() { return device_.get() + imaginary_stride(); }
+
   /// Accesses the tensor reference pointing to data
   TensorRef host_ref(LongIndex ptr_element_offset=0) {
     return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
@@ -416,6 +419,166 @@
       device_data(), host_data(), imaginary_stride() * 2);
     }
   }
 
+  /// Copies planar-complex data from caller-supplied device pointers into host memory.
+  void copy_in_device_to_host(
+    Element const* ptr_device_real,    ///< source device memory (real part)
+    Element const* ptr_device_imag,    ///< source device memory (imaginary part)
+    LongIndex count = -1) {            ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_to_host(
+      host_data(), ptr_device_real, count);
+
+    device_memory::copy_to_host(
+      host_data_imag(), ptr_device_imag, count);
+  }
+
+  /// Copies planar-complex data from caller-supplied device pointers into device memory.
+  void copy_in_device_to_device(
+    Element const* ptr_device_real,    ///< source device memory (real part)
+    Element const* ptr_device_imag,    ///< source device memory (imaginary part)
+    LongIndex count = -1) {            ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_device_to_device(
+      device_data(), ptr_device_real, count);
+
+    device_memory::copy_device_to_device(
+      device_data_imag(), ptr_device_imag, count);
+  }
+
+  /// Copies planar-complex data from caller-supplied host pointers into device memory.
+  void copy_in_host_to_device(
+    Element const* ptr_host_real,      ///< source host memory (real part)
+    Element const* ptr_host_imag,      ///< source host memory (imaginary part)
+    LongIndex count = -1) {            ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_to_device(
+      device_data(), ptr_host_real, count);
+
+    device_memory::copy_to_device(
+      device_data_imag(), ptr_host_imag, count);
+  }
+
+  /// Copies planar-complex data from caller-supplied host pointers into host memory.
+  void copy_in_host_to_host(
+    Element const* ptr_host_real,      ///< source host memory (real part)
+    Element const* ptr_host_imag,      ///< source host memory (imaginary part)
+    LongIndex count = -1) {            ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_host_to_host(
+      host_data(), ptr_host_real, count);
+
+    device_memory::copy_host_to_host(
+      host_data_imag(), ptr_host_imag, count);
+  }
+
+  /// Copies planar-complex data from device memory into caller-supplied host pointers.
+  void copy_out_device_to_host(
+    Element * ptr_host_real,           ///< destination host memory (real part)
+    Element * ptr_host_imag,           ///< destination host memory (imaginary part)
+    LongIndex count = -1) const {      ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_to_host(
+      ptr_host_real, device_data(), count);
+
+    device_memory::copy_to_host(
+      ptr_host_imag, device_data_imag(), count);
+  }
+
+  /// Copies planar-complex data from device memory into caller-supplied device pointers.
+  void copy_out_device_to_device(
+    Element * ptr_device_real,         ///< destination device memory (real part)
+    Element * ptr_device_imag,         ///< destination device memory (imaginary part)
+    LongIndex count = -1) const {      ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_device_to_device(
+      ptr_device_real, device_data(), count);
+
+    device_memory::copy_device_to_device(
+      ptr_device_imag, device_data_imag(), count);
+  }
+
+  /// Copies planar-complex data from host memory into caller-supplied device pointers.
+  void copy_out_host_to_device(
+    Element * ptr_device_real,         ///< destination device memory (real part)
+    Element * ptr_device_imag,         ///< destination device memory (imaginary part)
+    LongIndex count = -1) const {      ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_to_device(
+      ptr_device_real, host_data(), count);
+
+    device_memory::copy_to_device(
+      ptr_device_imag, host_data_imag(), count);
+  }
+
+  /// Copies planar-complex data from host memory into caller-supplied host pointers.
+  void copy_out_host_to_host(
+    Element * ptr_host_real,           ///< destination host memory (real part)
+    Element * ptr_host_imag,           ///< destination host memory (imaginary part)
+    LongIndex count = -1) const {      ///< number of elements to transfer; if negative, the entire tensor is copied
+
+    if (count < 0) {
+      count = capacity();
+    }
+    else {
+      count = __NV_STD_MIN(capacity(), count);
+    }
+
+    device_memory::copy_host_to_host(
+      ptr_host_real, host_data(), count);
+
+    device_memory::copy_host_to_host(
+      ptr_host_imag, host_data_imag(), count);
+  }
+
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
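
To show how the new copy-in/copy-out methods are meant to be used, here is a hedged usage sketch. It assumes the HostTensor-style constructor taking a matrix extent; the pointer parameters are hypothetical caller-owned buffers:

```c++
#include <cutlass/layout/matrix.h>
#include <cutlass/util/host_tensor_planar_complex.h>

void planar_complex_roundtrip(
    float const *host_real_in, float const *host_imag_in,  // caller-owned inputs
    float *host_real_out, float *host_imag_out,            // caller-owned outputs
    int rows, int columns) {

  // Planar-complex tensor: real and imaginary parts stored in separate planes.
  cutlass::HostTensorPlanarComplex<float, cutlass::layout::ColumnMajor> tensor({rows, columns});

  // Copy-in: caller-supplied host data into the tensor's device allocation
  // (the count argument defaults to the full tensor capacity).
  tensor.copy_in_host_to_device(host_real_in, host_imag_in);

  // ... run a planar-complex kernel against tensor.device_data() and
  //     tensor.device_data_imag() here ...

  // Copy-out: device-side results back into caller-supplied host buffers.
  tensor.copy_out_device_to_host(host_real_out, host_imag_out);
}
```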