2017-12-05 13:12:52 +08:00
|
|
|
/******************************************************************************
|
2017-12-06 23:00:59 +08:00
|
|
|
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
2017-12-05 13:12:52 +08:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
2017-12-06 23:00:59 +08:00
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
* * Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
|
|
* names of its contributors may be used to endorse or promote products
|
|
|
|
* derived from this software without specific prior written permission.
|
2017-12-05 13:12:52 +08:00
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
|
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
|
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
******************************************************************************/
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
/**
|
|
|
|
* \file
|
|
|
|
* Matrix data structure providing basic CPU-based algorithms and
|
|
|
|
* operations that can be cloned and synchronized in GPU device memory
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
#include <fstream>
|
|
|
|
|
|
|
|
#include <cutlass/util/debug.h>
|
|
|
|
#include "../cutlass/util/matrix_transform.h"
|
|
|
|
#include "half.h"
|
|
|
|
|
|
|
|
|
|
|
|
namespace cutlass {
|
|
|
|
|
|
|
|
/**
|
|
|
|
* \brief Matrix data structure providing basic CPU-based algorithms and
|
|
|
|
* operations that be synchronized with a GPU-based replica
|
|
|
|
*/
|
|
|
|
template <typename value_t>
|
|
|
|
struct matrix
|
|
|
|
{
|
|
|
|
// Host value type (must be convertible to/from value_t)
|
|
|
|
typedef typename nv_std::conditional<
|
|
|
|
(nv_std::is_same<value_t, __half>::value), // If (value_t == __half) ...
|
|
|
|
half_t, // ... use half_t internally for host storage, else...
|
|
|
|
value_t>::type // ... use value_t directly
|
|
|
|
host_value_t;
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Data members
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
/// M dimension (height in rows)
|
|
|
|
int _m;
|
|
|
|
|
|
|
|
/// N dimension (width in columns)
|
|
|
|
int _n;
|
|
|
|
|
|
|
|
/// Data array on host
|
|
|
|
std::vector<host_value_t> _h_data;
|
|
|
|
|
|
|
|
/// Clone of data array on GPU device
|
|
|
|
value_t *_d_data;
|
|
|
|
|
|
|
|
/// GPU Device identifier that clone synchronizes with
|
|
|
|
int _device_id;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Lifetime and synchronization
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Constructor: zero-initializes the matrix.
|
|
|
|
*/
|
|
|
|
matrix(
|
|
|
|
int m, ///< Height of the matrix in rows
|
|
|
|
int n) ///< Width of the matrix in columns
|
|
|
|
:
|
|
|
|
_m(m),
|
|
|
|
_n(n),
|
|
|
|
_d_data(NULL),
|
|
|
|
_device_id(0)
|
|
|
|
{
|
|
|
|
_h_data.resize(_m * _n, 0);
|
|
|
|
CUDA_PERROR_EXIT(cudaMalloc((void ** )&_d_data, sizeof(value_t) * _m * _n));
|
|
|
|
CUDA_PERROR_EXIT(cudaGetDevice(&_device_id));
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Destructor
|
|
|
|
~matrix()
|
|
|
|
{
|
|
|
|
if (_d_data)
|
|
|
|
{
|
|
|
|
CUDA_PERROR_EXIT(cudaFree(_d_data));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Synchronize the GPU-based replica with the current host-based matrix data
|
|
|
|
*/
|
|
|
|
void sync_device()
|
|
|
|
{
|
|
|
|
size_t bytes = _m * _n * sizeof(value_t);
|
|
|
|
CUDA_PERROR_EXIT(cudaMemcpy(_d_data, &_h_data[0], bytes, cudaMemcpyHostToDevice));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Synchronize the host-based replica with the current GPU-based matrix data
|
|
|
|
*/
|
|
|
|
void sync_host()
|
|
|
|
{
|
|
|
|
size_t bytes = _m * _n * sizeof(value_t);
|
|
|
|
CUDA_PERROR_EXIT(cudaMemcpy(&_h_data[0], _d_data, bytes, cudaMemcpyDeviceToHost));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Inspectors
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the height of the matrix, subject to the optional \p transpose_op
|
|
|
|
*/
|
|
|
|
int height(matrix_transform_t transpose_op = matrix_transform_t::NonTranspose) const
|
|
|
|
{
|
|
|
|
switch (transpose_op)
|
|
|
|
{
|
|
|
|
case matrix_transform_t::NonTranspose : return _m;
|
|
|
|
case matrix_transform_t::Transpose : return _n;
|
|
|
|
default: return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the width of the matrix, subject to the optional \p transpose_op
|
|
|
|
*/
|
|
|
|
int width(matrix_transform_t transpose_op = matrix_transform_t::NonTranspose) const
|
|
|
|
{
|
|
|
|
switch (transpose_op)
|
|
|
|
{
|
|
|
|
case matrix_transform_t::NonTranspose : return _n;
|
|
|
|
case matrix_transform_t::Transpose : return _m;
|
|
|
|
default: return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return item at (x, y) coordinate of matrix, subject to the optional \p transform op
|
|
|
|
*/
|
|
|
|
host_value_t get(
|
|
|
|
int x,
|
|
|
|
int y,
|
|
|
|
matrix_transform_t transpose_op = matrix_transform_t::NonTranspose) const
|
|
|
|
{
|
|
|
|
switch (transpose_op)
|
|
|
|
{
|
|
|
|
case matrix_transform_t::NonTranspose : return _h_data[y + (x * _m)];
|
|
|
|
case matrix_transform_t::Transpose : return _h_data[x + (y * _m)];
|
|
|
|
default: return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the distance (in items) within memory between elements of two
|
|
|
|
* consecutive columns which have the same row index, subject to the optional \p transform op
|
|
|
|
*/
|
|
|
|
int leading_dim(matrix_transform_t transpose_op = matrix_transform_t::NonTranspose) const
|
|
|
|
{
|
|
|
|
switch (transpose_op)
|
|
|
|
{
|
|
|
|
case matrix_transform_t::NonTranspose : return _m;
|
|
|
|
case matrix_transform_t::Transpose : return _n;
|
|
|
|
default: return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get host data pointer
|
|
|
|
*/
|
|
|
|
value_t* h_data()
|
|
|
|
{
|
|
|
|
return _h_data.data();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get host data pointer
|
|
|
|
*/
|
|
|
|
value_t const* h_data() const
|
|
|
|
{
|
|
|
|
return _h_data.data();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get device data pointer
|
|
|
|
*/
|
|
|
|
value_t const* d_data() const
|
|
|
|
{
|
|
|
|
return _d_data;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get device data pointer
|
|
|
|
*/
|
|
|
|
value_t * d_data()
|
|
|
|
{
|
|
|
|
return _d_data;
|
|
|
|
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Initialization
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initialize matrix values with a 2D "ramp" defined as
|
|
|
|
* <tt>values(x, y) = (y * rs) + (x * cs)</tt>
|
|
|
|
*/
|
|
|
|
void fill_ramp(
|
|
|
|
host_value_t rs,
|
|
|
|
host_value_t cs)
|
|
|
|
{
|
|
|
|
for (int x = 0; x < _n; x++)
|
|
|
|
{
|
|
|
|
for (int y = 0; y < _m; y++)
|
|
|
|
{
|
|
|
|
_h_data[y + (x * _m)] = host_value_t((y * rs) + (x * cs));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initialize matrix values such that all the elements of the principal diagonal
|
|
|
|
* are ones and all other elements are zeros
|
|
|
|
*/
|
|
|
|
void fill_identity()
|
|
|
|
{
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
_h_data[i + j * _m] = host_value_t(i == j ? 1 : 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initialize matrix values using the random number \p generator. The
|
|
|
|
* \p generator reference is assumed to be a nullary functor that returns
|
|
|
|
* values convertible to the matrix \p value_t.
|
|
|
|
*/
|
|
|
|
template <typename T>
|
|
|
|
void fill_random(T & generator)
|
|
|
|
{
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
_h_data[i + j * _m] = (value_t) generator();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Element-wise matrix addition
|
|
|
|
*/
|
|
|
|
matrix & operator+=(matrix const &mat)
|
|
|
|
{
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
_h_data[i + j * _m] += mat._h_data[i + j * _m];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Element-wise matrix subtraction
|
|
|
|
*/
|
|
|
|
matrix & operator-=(matrix const &mat)
|
|
|
|
{
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
_h_data[i + j * _m] -= mat._h_data[i + j * _m];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Output
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Prints matrix in CSV to output stream
|
|
|
|
*/
|
|
|
|
template <typename _hv_t>
|
|
|
|
std::ostream & write_matrix(std::ostream &out, _hv_t)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
out << (j ? "," : "") << _h_data[i + j * _m];
|
|
|
|
}
|
|
|
|
out << "\n";
|
|
|
|
}
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Prints matrix in CSV to output stream
|
|
|
|
*/
|
|
|
|
std::ostream & write_matrix(std::ostream &out, int8_t)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
out << (j ? "," : "") << int32_t(_h_data[i + j * _m]);
|
|
|
|
}
|
|
|
|
out << "\n";
|
|
|
|
}
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Prints matrix in CSV to output stream
|
|
|
|
*/
|
|
|
|
std::ostream & write_matrix(std::ostream &out)
|
|
|
|
{
|
|
|
|
return write_matrix(out, _h_data[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Floating point "almost-equal" utilities
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
static bool almost_equal_ulps(half_t a, half_t b, int max_ulps)
|
|
|
|
{
|
|
|
|
if (a == b)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
int32_t int_diff = abs(a.raw() - b.raw());
|
|
|
|
if (int_diff <= max_ulps)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static bool almost_equal_ulps(float a, float b, int max_ulps)
|
|
|
|
{
|
|
|
|
if (a == b)
|
|
|
|
return true;
|
|
|
|
int32_t int_diff = abs(*(int32_t*)&a - *(int32_t*)&b);
|
|
|
|
if (int_diff <= max_ulps)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static bool almost_equal_ulps(double a, double b, int max_ulps)
|
|
|
|
{
|
|
|
|
if (a == b)
|
|
|
|
return true;
|
|
|
|
int64_t int_diff = abs(*(int64_t*)&a - *(int64_t*)&b);
|
|
|
|
if (int_diff <= max_ulps)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool almost_equal_ulps(int32_t a, int32_t b, int max_ulps)
|
|
|
|
{
|
|
|
|
return (a == b);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// matrix operations
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns matrix equality
|
|
|
|
*/
|
|
|
|
bool operator==(const matrix<value_t> &mat) const
|
|
|
|
{
|
|
|
|
int max_ulps = 30;
|
|
|
|
|
|
|
|
if (_m != mat._m || _n != mat._n)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Error: dimension mismatch during matrix comparison.\n"); exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int j = 0; j < _n; j++)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < _m; i++)
|
|
|
|
{
|
|
|
|
if (!almost_equal_ulps(_h_data[i + j * _m], mat._h_data[i + j * _m], max_ulps))
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns matrix inequality
|
|
|
|
*/
|
|
|
|
bool operator!=(const matrix<value_t> &mat) const
|
|
|
|
{
|
|
|
|
return !(*this == mat);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Computes this = (alpha * op(A) * op(B)) + (beta * this), specialized for gemm_nn
|
|
|
|
*/
|
|
|
|
template <typename multiplicand_t>
|
|
|
|
void gemm(
|
|
|
|
matrix_transform_t transform_a,
|
|
|
|
matrix_transform_t transform_b,
|
|
|
|
host_value_t alpha,
|
|
|
|
const matrix<multiplicand_t> &A,
|
|
|
|
const matrix<multiplicand_t> &B,
|
|
|
|
host_value_t beta)
|
|
|
|
{
|
|
|
|
// Sanity check dimensions
|
|
|
|
if ((_m != A.height(transform_a)) ||
|
|
|
|
(_n != B.width(transform_b)) ||
|
|
|
|
(A.width(transform_a) != B.height(transform_b)))
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Error: dimension mismatch during gemm.\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
int M = A.height(transform_a);
|
|
|
|
int K = A.width(transform_a);
|
|
|
|
int N = B.width(transform_b);
|
|
|
|
|
|
|
|
// Even the host-side implementation utilizes a blocking structure to improve
|
|
|
|
// verification performance
|
|
|
|
int DimBlockM = (M % 16 == 0) ? 16 : 1;
|
|
|
|
int DimBlockN = (N % 16 == 0) ? 16 : 1;
|
|
|
|
|
|
|
|
for (int i = 0; i < M; i += DimBlockM)
|
|
|
|
{
|
|
|
|
for (int j = 0; j < N; j += DimBlockN)
|
|
|
|
{
|
|
|
|
for (int block_y = 0; block_y < DimBlockM; block_y++)
|
|
|
|
{
|
|
|
|
for (int block_x = 0; block_x < DimBlockN; block_x++)
|
|
|
|
{
|
|
|
|
int y = i + block_y;
|
|
|
|
int x = j + block_x;
|
|
|
|
|
|
|
|
host_value_t accum(0);
|
|
|
|
for (int k = 0; k < K; k++)
|
|
|
|
{
|
|
|
|
accum += host_value_t(A.get(k, y, transform_a)) * host_value_t(B.get(x, k, transform_b));
|
|
|
|
}
|
|
|
|
|
|
|
|
_h_data[y + x * M] = (alpha * accum) + (beta * _h_data[y + x * M]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace cutlass
|