cutlass/tools/test/unit/util/tensor_foreach.cu
Andrew Kerr 877bdcace6
Cutlass 1.3 Release (#42)
CUTLASS 1.3 Release
- Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1.
2019-03-20 10:49:17 -07:00

218 lines
7.0 KiB
Plaintext

/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief
These tests are intended to demonstrate the CUTLASS reference implementation for basic for-each
operators on the index space of TensorView objects. They instantiate a HostMatrix, initialize
its elements with random data according to specified random distributions, and clamp the
elements using a TensorForEach() operation.
Both device-side and host-side reference implementations are called.
*/
#include "cutlass_unit_test.h"
#include "cutlass/matrix_traits.h"
#include "tools/util/tensor_view_io.h"
#include "tools/util/host_tensor.h"
#include "tools/util/host_matrix.h"
#include "tools/util/reference/device/tensor_foreach.h"
#include "tools/util/reference/device/tensor_elementwise.h"
#include "tools/util/reference/host/tensor_foreach.h"
#include "tools/util/reference/host/tensor_elementwise.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
/// Define a functor that computes the ReLu operation on a tensor.
template <typename View>
struct ReLuFunc {
/// Coordinate of index space
typedef typename View::TensorCoord TensorCoord;
/// Scalar type
typedef typename View::Storage T;
//
// Data members
//
/// Tensor view
View view;
/// ReLu threshold
T threshold;
//
// Methods
//
/// Constructor
CUTLASS_HOST_DEVICE
ReLuFunc(View const &view, T threshold): view(view), threshold(threshold) { }
/// ReLu function
CUTLASS_HOST_DEVICE
void operator()(TensorCoord const &coord) {
T value = view.at(coord);
if (value < threshold) {
value = threshold;
}
view.at(coord) = value;
}
};
} // namespace test
///////////////////////////////////////////////////////////////////////////////////////////////////
/// This tests models the computation of ReLu using reference utility code.
TEST(TensorForEach, ReLu_device) {
// Define HostMatrix type
typedef cutlass::HostMatrix<float> HostMatrix;
typedef typename HostMatrix::DeviceTensorView View;
// Define the problem size
int const M = 517;
int const N = 117;
float threshold = 0;
// Construct the host matrix
HostMatrix source(cutlass::MatrixCoord(M, N), cutlass::MatrixLayout::kRowMajor);
source.fill(0);
// Initialize the source matrix with a uniform distribution
cutlass::Distribution dist;
dist.set_uniform(-16, 16);
// RNG seed is hard-coded for determinism in the test.
int64_t seed = 2080;
cutlass::reference::device::TensorInitialize(source.device_view(), seed, dist);
// Define a functor called by TensorForEach<>
typedef test::ReLuFunc<View> ReLuFunc;
// Instantiate on host with TensorView and threshold value
ReLuFunc relu_func(source.device_view(), threshold);
// Launch kernel that applies the element-wise operator over the tensor's index space.
cutlass::reference::device::TensorForEach<
ReLuFunc,
View::kRank,
ReLuFunc>(source.size(), relu_func);
// Verify no element is less than the ReLu threshold.
source.sync_host();
int errors = 0;
for (cutlass::MatrixCoord coord(0, 0); coord.row() < M; ++coord.row()) {
for (coord.column() = 0; coord.column() < N; ++coord.column()) {
if (source.at(coord) < threshold) {
++errors;
if (errors < 10) {
std::cout << "Error - source(" << coord << ") = "
<< source.at(coord) << " is less than threshold " << threshold << std::endl;
}
}
}
}
EXPECT_EQ(errors, 0)
<< "Result: " << source;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Test to apply the ReLu operation using host-side utilities
TEST(TensorForEach, ReLu_host) {
// Define HostMatrix type
typedef cutlass::HostMatrix<float> HostMatrix;
typedef typename HostMatrix::HostTensorView View;
// Define the problem size
int const M = 517;
int const N = 117;
float threshold = 0;
bool const kDeviceBacked = false;
// Construct the host matrix
HostMatrix source(cutlass::MatrixCoord(M, N), cutlass::MatrixLayout::kRowMajor, kDeviceBacked);
source.fill(0);
// Initialize the source matrix with a uniform distribution
cutlass::Distribution dist;
dist.set_gaussian(-1, 4);
// RNG seed is hard-coded for determinism in the test.
unsigned seed = 2080;
cutlass::reference::host::TensorInitialize(source.host_view(), seed, dist);
// Define a functor called by TensorForEach<>
typedef test::ReLuFunc<View> ReLuFunc;
// Instantiate on host with TensorView and threshold value
ReLuFunc relu_func(source.host_view(), threshold);
// Invoke host-side for-each computation on the tensor
cutlass::reference::host::TensorForEach<
ReLuFunc,
View::kRank,
ReLuFunc>(source.size(), relu_func);
int errors = 0;
for (cutlass::MatrixCoord coord(0, 0); coord.row() < M; ++coord.row()) {
for (coord.column() = 0; coord.column() < N; ++coord.column()) {
if (source.at(coord) < threshold) {
++errors;
if (errors < 10) {
std::cout << "Error - source(" << coord << ") = "
<< source.at(coord) << " is less than threshold " << threshold << std::endl;
}
}
}
}
EXPECT_EQ(errors, 0)
<< "Result: " << source;
}
///////////////////////////////////////////////////////////////////////////////////////////////////