/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

/*! \file
    \brief Template class to perform computations on tensors and manage memory.
*/

#include <cutlass/cutlass.h>
#include <cutlass/matrix_traits.h>
#include <tools/util/device_memory.h>
#include <tools/util/host_tensor_view.h>
#include <tools/util/type_traits.h>

#include <vector>

namespace cutlass {
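
/// Host-side tensor with an optional mirrored device-side allocation.
///
/// Example usage, as a minimal sketch. It assumes the default device-backed mode and
/// that TypeTraits<float>::device_type maps to float:
///
///   HostTensor<float> tensor;
///   tensor.resize_matrix(128, 64, MatrixLayout::kColumnMajor);
///   tensor.fill_sequential();   // writes host memory, then syncs to the device
///   // ... launch kernels that read or write tensor.device_data() ...
///   tensor.sync_host();         // copies device results back for host-side inspection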
template <typename T, bool DeviceBacked_ = true>
class HostTensor : public HostTensorView<T> {
 public:
  /// Type used for device-side allocations
  typedef typename TypeTraits<T>::device_type DeviceType;

  /// Base class
  typedef HostTensorView<T> Base;

  /// If true, allocates device-side memory in addition to host memory
  static bool const DeviceBacked = DeviceBacked_;

  /// Rank of the tensor
  static int const Rank = Base::Rank;

  /// Type used to compute the offset of an element relative to the base of the tensor
  typedef typename Base::Offset_t Offset_t;

  /// Tensor reference to host memory
  typedef typename Base::TensorRef_t TensorRef_t;

  /// Tensor reference to device memory
  typedef TensorRef<DeviceType, TensorRef_t::Rank> DeviceTensorRef;

  /// Tensor reference to constant device memory
  typedef TensorRef<DeviceType const, TensorRef_t::Rank> ConstDeviceTensorRef;

  /// Coordinate into the tensor
  typedef typename Base::Coord_t Coord_t;

 private:
  /// Host-side memory allocation
  std::vector<T> host_;

  /// Device-side memory allocation
  cutlass::device_memory::allocation<DeviceType> device_;

 public:
  //
  // Device and Host Methods
  //

  /// Default constructor
  HostTensor() {}

  /// Constructs a HostTensor from stride and size
  HostTensor(Coord_t const& _stride, Coord_t const& _size) { reset(_stride, _size); }

  /// Constructs a HostTensor from size alone, inferring strides
  HostTensor(Coord_t const& _size) {
    Coord_t _stride = make_Coord(
        _size.at(2) * _size.at(1) * _size.at(0), _size.at(1) * _size.at(0), _size.at(0), 1);
    reset(_stride, _size);
  }

  /// Returns the number of elements needed to back the tensor
  size_t capacity() { return Base::capacity(); }

  /// Returns true if the tensor is bound to some memory
  bool good() const { return Base::good(); }

  /// Updates the reference and size of the tensor, reallocating host and device memory
  void reset(Coord_t const& _stride, Coord_t const& _size) {
    size_t _capacity = _size.at(0) * _stride.at(0);

    DeviceType* _device_memory = nullptr;
    if (DeviceBacked) {
      _device_memory = cutlass::device_memory::allocate<DeviceType>(_capacity);
    }

    host_.clear();
    host_.resize(_capacity);
    device_.reset(_device_memory, _capacity);

    Base::reset(TensorRef_t(host_.data(), _stride), _size);
  }
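
  // Note that capacity is computed as size[0] * stride[0]: for a matrix initialized via
  // resize_matrix(rows, columns, ...), size[0] == 1 and stride[0] == rows * columns, so
  // exactly rows * columns elements are allocated on the host and, if device-backed, the device.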

  /// Initializes the host tensor as a matrix
  void resize_matrix(int rows, int columns, MatrixLayout::Kind layout) {
    bool col_major = (layout == MatrixLayout::kColumnMajor);
    int ldm = (col_major ? rows : columns);

    Coord_t stride = make_Coord(rows * columns, col_major ? 1 : ldm, col_major ? ldm : 1, 1);
    Coord_t size = make_Coord(1, rows, columns, 1);

    reset(stride, size);
  }
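
  // Example, as a sketch: resize_matrix(128, 64, MatrixLayout::kColumnMajor) produces
  //   stride == { 128 * 64, 1, 128, 1 } and size == { 1, 128, 64, 1 },
  // so element (row, column) lives at coordinate (0, row, column, 0) with leading
  // dimension 128.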

  /// Resizes the host tensor as a contiguous vector of elements
  void resize(int elements) { resize_matrix(1, elements, MatrixLayout::kColumnMajor); }

  /// Gets a const pointer to host data
  T const* host_data() const { return host_.data(); }

  /// Gets a pointer to host data
  T* host_data() { return host_.data(); }

  /// Gets a pointer to device data
  DeviceType* device_data() const { return device_.get(); }

  /// Copies data from device to host
  void sync_host() {
    if (DeviceBacked) {
      device_memory::copy_to_host(
          host_.data(), reinterpret_cast<T const*>(device_.get()), host_.size());
    }
  }

  /// Copies data from host to device
  void sync_device() {
    if (DeviceBacked) {
      device_memory::copy_to_device(
          device_.get(), reinterpret_cast<DeviceType const*>(host_.data()), host_.size());
    }
  }
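
  // A typical round trip, as a sketch: mutate elements on the host, push them to the
  // device, run device code, then pull the results back:
  //
  //   tensor.at(make_Coord(0, row, column, 0)) = value;
  //   tensor.sync_device();
  //   // ... kernel writes through tensor.device_data() ...
  //   tensor.sync_host();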

  /// Copies data into host memory from a caller-supplied device pointer
  void copy_to_host(DeviceType const* ptr_device) {
    device_memory::copy_to_host(
        host_.data(), reinterpret_cast<T const*>(ptr_device), host_.size());
  }

  /// Copies host data to a caller-supplied device pointer
  void copy_to_device(DeviceType* ptr_device) {
    device_memory::copy_to_device(
        ptr_device, reinterpret_cast<DeviceType const*>(host_.data()), host_.size());
  }

  /// Accesses the tensor reference pointing to host data
  TensorRef_t& host_ref() { return Base::ref(); }

  /// Accesses the tensor reference pointing to host data
  TensorRef_t const& host_ref() const { return Base::ref(); }

  /// Returns a tensor reference pointing to device data
  DeviceTensorRef device_ref() const { return DeviceTensorRef(device_data(), stride()); }

  /// Returns a tensor reference to constant memory on the device
  ConstDeviceTensorRef const_device_ref() const {
    return ConstDeviceTensorRef(device_data(), stride());
  }
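
  // The reference accessors are what device-side interfaces typically consume. A
  // sketch, where some_device_function is a hypothetical callee rather than part of
  // this class:
  //
  //   some_device_function(tensor_a.const_device_ref(), tensor_c.device_ref());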

  /// Accesses the size of the tensor
  Coord_t const& size() const { return Base::size(); }

  /// Accesses the size along one dimension
  int size(int dim) const { return Base::size(dim); }

  /// Accesses the stride of the tensor
  Coord_t const& stride() const { return Base::stride(); }

  /// Accesses the stride along one dimension
  int stride(int dim) const { return Base::stride(dim); }

  /// Returns the linear offset of an element
  Offset_t offset(Coord_t const& coord) const { return Base::offset(coord); }

  /// Determines whether a location is within the tensor
  bool contains(Coord_t const& coord) const { return Base::contains(coord); }

  /// Element-wise accessor
  T& at(Coord_t const& coord) const { return Base::at(coord); }

  /// Element-wise accessor
  T& operator[](Coord_t const& coord) { return at(coord); }

  /// Element-wise accessor with a linear offset
  T& at(int idx) const { return Base::at(idx); }

  /// Returns a TensorView of the region given by location and size
  TensorView<T> subview(Coord_t const& _location, Coord_t _size) const {
    return Base::subview(_location, _size);
  }

  /// Recurses through all dimensions and applies a unary operation
  template <typename F>
  void elementwise_in_place(F& op, int dim = 0, Offset_t dst_offset_base = 0) {
    Base::elementwise_in_place(op, dim, dst_offset_base);
  }

  /// Recurses through all dimensions and applies a unary operator, supplying the logical
  /// coordinate within the tensor as an argument
  template <typename F>
  void elementwise_stream(F& op, int dim = 0, Offset_t dst_offset_base = 0) {
    Base::elementwise_stream(op, dim, dst_offset_base);
  }

  /// Recurses through all dimensions and applies a unary operator, supplying the logical
  /// coordinate within the tensor as an argument
  template <typename F>
  void elementwise_generate(F& op,
                            int dim = 0,
                            Offset_t dst_offset_base = 0,
                            Coord_t coord = Coord_t(0)) {
    Base::elementwise_generate(op, dim, dst_offset_base, coord);
  }

  /// Recurses through all dimensions and applies a binary operation
  template <typename Src, typename F>
  bool elementwise_in_place(F& op,
                            int dim,
                            TensorView<Src> const& tensor,
                            Offset_t dst_offset_base = 0,
                            Offset_t src_offset_base = 0) {
    return Base::elementwise_in_place(op, dim, tensor, dst_offset_base, src_offset_base);
  }

  /// Accumulates in place, then synchronizes device memory
  template <typename Src>
  TensorView<T>& operator+=(TensorView<Src> const& tensor) {
    Base::operator+=(tensor);
    sync_device();
    return *this;
  }

  /// Subtracts in place, then synchronizes device memory
  template <typename Src>
  TensorView<T>& operator-=(TensorView<Src> const& tensor) {
    Base::operator-=(tensor);
    sync_device();
    return *this;
  }

  /// Multiplies in place, then synchronizes device memory
  template <typename Src>
  TensorView<T>& operator*=(TensorView<Src> const& tensor) {
    Base::operator*=(tensor);
    sync_device();
    return *this;
  }

  /// Divides in place, then synchronizes device memory
  template <typename Src>
  TensorView<T>& operator/=(TensorView<Src> const& tensor) {
    Base::operator/=(tensor);
    sync_device();
    return *this;
  }

  /// Equality with epsilon tolerance
  bool equals(TensorView<T> const& tensor, T epsilon) const {
    return Base::equals(tensor, epsilon);
  }

  /// Equality with ULPs tolerance
  bool bit_equals(TensorView<T> const& tensor, long long ulps_threshold = 0) {
    return Base::bit_equals(tensor, ulps_threshold);
  }
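
  // Verification sketch: compare a device result against a host-computed reference
  // with a caller-chosen absolute tolerance:
  //
  //   result.sync_host();
  //   bool passed = result.equals(reference, 1e-5f);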

  /// Computes a general matrix product among select dimensions of a tensor.
  /// Assumes:
  ///   D: number of independent GEMMs to compute
  ///   H: height of matrix
  ///   W: width of matrix
  template <
      /// Data type of A matrix elements
      typename A,
      /// Data type of B matrix elements
      typename B,
      /// Data type of "compute" type (i.e. accumulator)
      typename Ctype,
      /// Data type of scale factors
      typename Stype>
  void gemm(TensorView<A> const& tensor_a, TensorView<B> const& tensor_b, Stype alpha, Stype beta) {
    Base::template gemm<A, B, Ctype, Stype>(tensor_a, tensor_b, alpha, beta);
  }
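
  // Host-side reference GEMM sketch, computing the usual C = alpha * A * B + beta * C.
  // Ctype (the accumulator type) is not deducible from the arguments, so all template
  // parameters are spelled out; float operands are an assumption of this example:
  //
  //   tensor_c.gemm<float, float, float, float>(tensor_a, tensor_b, 1.0f, 0.0f);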

  /// Fills with random data, then synchronizes device memory
  template <typename Gen>
  void fill_random(Gen generator) {
    Base::fill_random(generator);
    sync_device();
  }

  /// Procedurally assigns elements, then synchronizes device memory
  template <typename Gen>
  void generate(Gen generator) {
    Base::generate(generator);
    sync_device();
  }

  /// Procedurally visits elements without modifying them
  template <typename Gen>
  void visit(Gen& generator) const {
    Base::visit(generator);
  }

  /// Initializes with the identity, then synchronizes device memory
  void fill_identity() {
    Base::fill_identity();
    sync_device();
  }

  /// Computes elements as a linear combination of their coordinates, then synchronizes
  /// device memory
  void fill_linear(Coord_t v, T offset = T(0)) {
    Base::fill_linear(v, offset);
    sync_device();
  }

  /// Fills elements with sequentially increasing values, then synchronizes device memory
  void fill_sequential(T v = T(1), T offset = T(0)) {
    Base::fill_sequential(v, offset);
    sync_device();
  }

  /// Fills with a uniform value, then synchronizes device memory
  void fill(T val = T(0)) {
    Base::fill(val);
    sync_device();
  }

  /// Copies from an external data source, performing type conversion, then synchronizes
  /// device memory
  template <typename Src>
  void fill(TensorView<Src> const& tensor) {
    Base::fill(tensor);
    sync_device();
  }
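
  // The fill and generate methods write host memory and then call sync_device(), keeping
  // the device copy coherent. A conversion sketch, assuming identically sized tensors:
  //
  //   HostTensor<double> source;
  //   HostTensor<float> dest;
  //   // ... size both identically and populate source ...
  //   dest.fill(source);   // element-wise conversion followed by host-to-device sync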

  /// Computes the norm of the tensor in double precision
  double norm() const { return Base::norm(); }
};

}  // namespace cutlass