cutlass/examples/45_dual_gemm/dual_gemm_run.h
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include <fstream>
#include <sstream>
#include <type_traits>
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/device/gemm_universal.h"
#include "dual_gemm_common.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
if((val1) <= (val2)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
#define CHECK_TRUE(val) \
if(!(val)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
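/// Elementwise functor, launched through cutlass::reference::device::TensorForEach, that
/// applies the dual-GEMM epilogue y = output_op(x0, x1) at every coordinate. It is used
/// below to build the reference D2 tensor from the two reference GEMM outputs.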
template <
typename OutputOp,
typename Element,
typename Layout>
struct TensorEpilogueForEachFunc {
/// View type
using TensorView = cutlass::TensorView<Element, Layout>;
/// Coordinate in tensor's index space
using TensorCoord = typename TensorView::TensorCoord;
/// Parameters structure
struct Params {
//
// Data members
//
TensorView view_x0;
TensorView view_x1;
TensorView view_y;
OutputOp output_op;
//
// Methods
//
Params(
TensorView view_x0_ = TensorView(),
TensorView view_x1_ = TensorView(),
TensorView view_y_ = TensorView(),
OutputOp output_op_ = OutputOp(typename OutputOp::Params{})
):
view_x0(view_x0_), view_x1(view_x1_), view_y(view_y_), output_op(output_op_) {
}
};
Params params;
CUTLASS_DEVICE
TensorEpilogueForEachFunc(Params const &params): params(params) {
}
CUTLASS_DEVICE
void operator()(TensorCoord const &coord) {
Element const & x0 = params.view_x0.at(coord);
Element const & x1 = params.view_x1.at(coord);
Element& y = params.view_y.at(coord);
y = params.output_op(x0, x1);
}
};
template <
typename OutputOp,
typename Element,
typename Layout>
void TensorEpilogueForEach(
cutlass::TensorView<Element, Layout> x0,
cutlass::TensorView<Element, Layout> x1,
cutlass::TensorView<Element, Layout> y) {
using Func = TensorEpilogueForEachFunc<OutputOp, Element, Layout>;
using Params = typename Func::Params;
cutlass::reference::device::TensorForEach<Func, Layout::kRank, Params>(
y.extent(),
Params(x0, x1, y)
);
}
////////////////////////////////////////////////////////////////////////////////
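/// Baseline runner: executes the two GEMMs of the dual-GEMM problem as separate, unfused
/// kernels (Gemm0 and Gemm1 share the same A operand), verifies both outputs against
/// device reference GEMMs, and optionally profiles the unfused pipeline.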
template <typename Gemm0_, typename Gemm1_>
struct NonFusedDualGemmRun
{
using Gemm0 = Gemm0_;
using Gemm1 = Gemm1_;
using ElementAccumulator = typename Gemm0::ElementAccumulator;
using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute;
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
//
// Methods
//
NonFusedDualGemmRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) { }
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
bool initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
cutlass::reference::host::TensorFillRandomUniform(
view, seed, 2, -2, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(
view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
std::cerr << "Not implemented\n";
return false;
}
return true;
}
/// Executes one test
bool run(
cutlass::gemm::GemmCoord problem_size,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool is_profiling = true,
bool relu = false,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
//
cutlass::HostTensor<
typename Gemm0::ElementA,
typename Gemm0::LayoutA> tensor_A0(problem_size.mk());
cutlass::HostTensor<
typename Gemm0::ElementB,
typename Gemm0::LayoutB> tensor_B0(problem_size.kn());
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_C0(problem_size.mn());
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_Bias0({1, problem_size.n()});
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_D0(problem_size.mn());
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> reference_D0(problem_size.mn());
cutlass::HostTensor<
typename Gemm1::ElementB,
typename Gemm1::LayoutB> tensor_B1(problem_size.kn());
cutlass::HostTensor<
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_C1(problem_size.mn());
cutlass::HostTensor<
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_Bias1({1, problem_size.n()});
cutlass::HostTensor<
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_D1(problem_size.mn());
cutlass::HostTensor<
typename Gemm1::ElementC,
typename Gemm1::LayoutC> reference_D1(problem_size.mn());
CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2014));
CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2013));
cutlass::reference::host::TensorFill(
tensor_D0.host_view());
cutlass::reference::host::TensorFill(
tensor_D1.host_view());
cutlass::reference::host::TensorFill(
reference_D0.host_view());
cutlass::reference::host::TensorFill(
reference_D1.host_view());
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
tensor_Bias0.sync_device();
tensor_D0.sync_device();
reference_D0.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1.sync_device();
reference_D1.sync_device();
//
// Initialize the GEMM operator
//
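// Exercise serial split-K with two slices when the kernel was built with kSplitKSerial;
// otherwise run a single slice.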
int split_k_slices = Gemm0::kSplitKSerial ? 2 : 1;
typename Gemm0::Arguments arguments_0{
problem_size,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
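// The C operand is the bias row, re-read for every output row via a zero stride.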
{tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
tensor_D0.device_ref(),
{alpha0, beta0},
split_k_slices
};
split_k_slices = Gemm1::kSplitKSerial ? 2 : 1;
typename Gemm1::Arguments arguments_1{
problem_size,
tensor_A0.device_ref(),
tensor_B1.device_ref(),
{tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
tensor_D1.device_ref(),
{alpha1, beta1},
split_k_slices
};
Gemm0 gemm_op_0;
Gemm1 gemm_op_1;
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace0(gemm_op_0.get_workspace_size(arguments_0));
cutlass::device_memory::allocation<uint8_t> workspace1(gemm_op_1.get_workspace_size(arguments_1));
cutlass::Status status = gemm_op_0.initialize(arguments_0, workspace0.get());
CUTLASS_CHECK(status);
status = gemm_op_1.initialize(arguments_1, workspace1.get());
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
status = gemm_op_1();
CUTLASS_CHECK(status);
}
if (is_profiling) {
//
// Profile the GEMM
//
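// Two stop events split the timeline so GEMM 0, GEMM 1, and the combined unfused time
// can each be reported.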
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
cudaEventCreate(&stop2);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < runs; i++) {
status = gemm_op_1();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop2);
cudaDeviceSynchronize();
float gemm0Time, gemm1Time, totalTime;
cudaEventElapsedTime(&gemm0Time, start, stop1);
cudaEventElapsedTime(&gemm1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
std::cout << "Non-fusion GEMM only time " << totalTime / (float)runs << " ms\n";
}
tensor_D0.sync_host();
tensor_D1.sync_host();
//
// Verify
//
cutlass::reference::device::Gemm<
typename Gemm0::ElementA, typename Gemm0::LayoutA,
typename Gemm0::ElementB, typename Gemm0::LayoutB,
typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute,
ElementAccumulator, typename Gemm0::Operator>
reference_gemm_0;
cutlass::reference::device::Gemm<
typename Gemm1::ElementA, typename Gemm1::LayoutA,
typename Gemm1::ElementB, typename Gemm1::LayoutB,
typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute,
ElementAccumulator, typename Gemm1::Operator>
reference_gemm_1;
reference_gemm_0(
problem_size,
alpha0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
beta0,
{tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
reference_D0.device_ref()
);
if(relu) {
cutlass::reference::device::TensorReLu(reference_D0.device_view());
}
reference_gemm_1(
problem_size,
alpha1,
tensor_A0.device_ref(),
tensor_B1.device_ref(),
beta1,
{tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
reference_D1.device_ref()
);
if(relu) {
cutlass::reference::device::TensorReLu(reference_D1.device_view());
}
// Wait for kernels to finish
cudaDeviceSynchronize();
reference_D0.sync_host();
reference_D1.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
bool passed0 = cutlass::reference::host::TensorEquals(
reference_D0.host_view(),
tensor_D0.host_view());
CHECK_TRUE(passed0);
bool passed1 = cutlass::reference::host::TensorEquals(
reference_D1.host_view(),
tensor_D1.host_view());
CHECK_TRUE(passed1);
if (!passed0 || !passed1) {
std::stringstream fname;
fname << "error_DualGemm_device_nonfused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream file(fname.str());
file
<< "A0 =\n" << tensor_A0.host_view()
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nD0 =\n" << tensor_D0.host_view()
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\n\nReference =\n" << reference_D1.host_view()
<< "\nComputed =\n" << tensor_D1.host_view();
}
return passed0 && passed1;
}
};
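/// Runner for the fused DualGemm kernel: a single launch computes D0 and D1 from the shared
/// A operand and the two B operands, then D2 = EpilogueOutputOp2(D0, D1). Results are
/// verified against two unfused GemmUniversal reference GEMMs and an elementwise reference
/// epilogue, with optional batched execution and B1 broadcast.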
template <typename DualGemm_>
struct DualFusedGemmRun
{
using DualGemm = DualGemm_;
using ElementAccumulator = typename DualGemm::ElementAccumulator;
using ElementCompute = typename DualGemm::DualGemmKernel::Epilogue0::OutputOp::ElementCompute;
using EpilogueOutputOp2 = typename DualGemm::EpilogueOutputOp2;
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Scale;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
//
// Methods
//
DualFusedGemmRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_),
init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) { }
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
bool initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
cutlass::reference::host::TensorFillRandomUniform(
view, seed, 2, -2, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(
view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
std::cerr << "Not implemented\n";
return false;
}
return true;
}
/// Executes one test
bool run(
cutlass::gemm::GemmCoord problem_size,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(1),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(1),
int batch_count = 1,
bool broadcast_b1 = false,
bool is_profiling = true,
bool relu = false,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
//
cutlass::HostTensor<
typename DualGemm::ElementA,
typename DualGemm::LayoutA> tensor_A0(
std::is_same<typename DualGemm::LayoutA, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.k()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.k()));
cutlass::HostTensor<
typename DualGemm::ElementB,
typename DualGemm::LayoutB0> tensor_B0(
std::is_same<typename DualGemm::LayoutB0, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.k(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.k(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> tensor_C0(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutScaleBias> tensor_Bias0({batch_count, problem_size.n()});
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> tensor_D0(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> reference_D0(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementB,
typename DualGemm::LayoutB1> tensor_B1(
std::is_same<typename DualGemm::LayoutB1, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.k(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.k(), batch_count * problem_size.n()));
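// When B1 is broadcast, it degenerates to a single k-element column per batch; the kernel
// later receives it with a zero stride so every output column sees the same vector.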
if (broadcast_b1) {
tensor_B1.resize({problem_size.k(), batch_count});
}
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> tensor_C1(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutScaleBias> tensor_Bias1({batch_count, problem_size.n()});
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> tensor_D1(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> tensor_D2(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> reference_D1(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
cutlass::HostTensor<
typename DualGemm::ElementC,
typename DualGemm::LayoutC> reference_D2(
std::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2118));
CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2011));
CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2113));
CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2012));
cutlass::reference::host::TensorFill(
tensor_D0.host_view());
cutlass::reference::host::TensorFill(
tensor_D1.host_view());
cutlass::reference::host::TensorFill(
tensor_D2.host_view());
cutlass::reference::host::TensorFill(
reference_D0.host_view());
cutlass::reference::host::TensorFill(
reference_D1.host_view());
cutlass::reference::host::TensorFill(
reference_D2.host_view());
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
tensor_Bias0.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D0.sync_device();
tensor_D1.sync_device();
tensor_D2.sync_device();
reference_D0.sync_device();
reference_D1.sync_device();
reference_D2.sync_device();
//
// Batch strides (irrelevant when batch_count == 1)
//
int64_t batch_stride_A = problem_size.m() * problem_size.k();
int64_t batch_stride_B0 = problem_size.k() * problem_size.n();
int64_t batch_stride_B1 = problem_size.k() * problem_size.n();
if (broadcast_b1) {
// B1 is a (column) vector
batch_stride_B1 = problem_size.k();
}
int64_t batch_stride_Bias = problem_size.n();
int64_t batch_stride_D = problem_size.m() * problem_size.n();
//
// Initialize the GEMM operator
//
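// Note: serial split-K cannot be combined with batched mode; the kernel rejects arguments
// that request both.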
int split_k_slices = DualGemm::kSplitKSerial ? 2 : 1;
typename cutlass::TensorRef<typename DualGemm::ElementC, typename DualGemm::LayoutC> nullptr_ref{};
decltype(nullptr_ref) ref_B0, ref_B1;
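// Bias operands are attached only when the corresponding beta is non-zero; a zero stride
// makes the single bias row apply to every row of the output.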
if (beta0 != ElementCompute(0)) {
ref_B0 = {tensor_Bias0.device_data(), typename DualGemm::LayoutC::Stride(0)};
}
if (beta1 != ElementCompute(0)) {
ref_B1 = {tensor_Bias1.device_data(), typename DualGemm::LayoutC::Stride(0)};
}
typename DualGemm::Arguments arguments{
(batch_count > 1 ?
cutlass::gemm::DualGemmMode::kBatched :
cutlass::gemm::DualGemmMode::kGemm),
problem_size,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
ref_B0,
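// The intermediate outputs D0/D1 are stored only when the kernel was compiled with
// kStoreD0/kStoreD1; otherwise null references skip the stores.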
DualGemm::kStoreD0 ? tensor_D0.device_ref() : nullptr_ref,
(broadcast_b1 ?
typename DualGemm::TensorRefB1(tensor_B1.device_data(), 0) :
tensor_B1.device_ref()),
ref_B1,
DualGemm::kStoreD1 ? tensor_D1.device_ref() : nullptr_ref,
tensor_D2.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
{},
split_k_slices,
batch_count,
batch_stride_A,
batch_stride_B0,
batch_stride_B1,
batch_stride_Bias,
batch_stride_D,
};
//
// Run the GEMM
//
DualGemm b2b_gemm_op;
cutlass::device_memory::allocation<uint8_t> workspace(b2b_gemm_op.get_workspace_size(arguments));
cutlass::Status status = b2b_gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
status = b2b_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
}
if (is_profiling) {
//
// Profile the GEMM
//
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop);
cudaDeviceSynchronize();
float gemmTime;
cudaEventElapsedTime(&gemmTime, start, stop);
std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n";
}
tensor_D0.sync_host();
tensor_D1.sync_host();
tensor_D2.sync_host();
//
// Verify
//
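// Each fused output is checked against an unfused GemmUniversal reference run with the same
// batch strides; the reference D2 is then rebuilt with TensorEpilogueForEach and compared
// elementwise against the fused kernel's result.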
using GemmUniversal0 = cutlass::gemm::device::GemmUniversal<
typename DualGemm::ElementA, typename DualGemm::LayoutA,
typename DualGemm::ElementB, typename DualGemm::LayoutB0,
typename DualGemm::ElementC, typename DualGemm::LayoutC,
ElementAccumulator
>;
GemmUniversal0 reference_gemm0;
typename GemmUniversal0::Arguments args0 {
(batch_count > 1 ?
cutlass::gemm::GemmUniversalMode::kBatched :
cutlass::gemm::GemmUniversalMode::kGemm),
problem_size,
batch_count,
{alpha0, beta0},
tensor_A0.device_data(),
tensor_B0.device_data(),
tensor_Bias0.device_data(),
reference_D0.device_data(),
batch_stride_A,
batch_stride_B0,
batch_stride_Bias,
batch_stride_D,
tensor_A0.stride(0),
tensor_B0.stride(0),
0, // zero stride for the bias vector
reference_D0.stride(0),
};
status = reference_gemm0.can_implement(args0);
CUTLASS_CHECK(status);
status = reference_gemm0(args0);
CUTLASS_CHECK(status);
using GemmUniversal1 = cutlass::gemm::device::GemmUniversal<
typename DualGemm::ElementA, typename DualGemm::LayoutA,
typename DualGemm::ElementB, typename DualGemm::LayoutB1,
typename DualGemm::ElementC, typename DualGemm::LayoutC,
ElementAccumulator
>;
GemmUniversal1 reference_gemm1;
typename GemmUniversal1::Arguments args1 {
(batch_count > 1 ?
cutlass::gemm::GemmUniversalMode::kBatched :
cutlass::gemm::GemmUniversalMode::kGemm),
problem_size,
batch_count,
{alpha1, beta1},
tensor_A0.device_data(),
tensor_B1.device_data(),
tensor_Bias1.device_data(),
reference_D1.device_data(),
batch_stride_A,
batch_stride_B1,
batch_stride_Bias,
batch_stride_D,
tensor_A0.stride(0),
(broadcast_b1 ? 0 : tensor_B1.stride(0)),
0, // zero stride for the bias vector
reference_D1.stride(0),
};
status = reference_gemm1.can_implement(args1);
CUTLASS_CHECK(status);
status = reference_gemm1(args1);
CUTLASS_CHECK(status);
if(relu) {
cutlass::reference::device::TensorReLu(reference_D0.device_view());
cutlass::reference::device::TensorReLu(reference_D1.device_view());
}
TensorEpilogueForEach<EpilogueOutputOp2>(reference_D0.device_view(), reference_D1.device_view(), reference_D2.device_view());
cudaDeviceSynchronize();
reference_D0.sync_host();
reference_D1.sync_host();
reference_D2.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D2.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D2.host_view()), 0);
bool passed_out0 = true;
if (DualGemm::kStoreD0) {
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
passed_out0 = cutlass::reference::host::TensorEquals(
reference_D0.host_view(),
tensor_D0.host_view());
}
CHECK_TRUE(passed_out0);
bool passed_out1 = true;
if (DualGemm::kStoreD1) {
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
passed_out1 = cutlass::reference::host::TensorEquals(
reference_D1.host_view(),
tensor_D1.host_view());
}
CHECK_TRUE(passed_out1);
bool passed_out2 = cutlass::reference::host::TensorEquals(
reference_D2.host_view(),
tensor_D2.host_view());
CHECK_TRUE(passed_out2);
bool passed = passed_out0 && passed_out1 && passed_out2;
if (!passed)
{
std::stringstream fname;
fname << "error_DualGemm_device_fused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream file(fname.str());
file
<< "A0 =\n" << tensor_A0.host_view()
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\n\nReference0 =\n" << reference_D0.host_view()
<< "\nComputed0 =\n" << tensor_D0.host_view()
<< "\n\nReference1 =\n" << reference_D1.host_view()
<< "\nComputed1 =\n" << tensor_D1.host_view()
<< "\n\nReference2 =\n" << reference_D2.host_view()
<< "\nComputed2 =\n" << tensor_D2.host_view();
}
//std::cout << "A0 " << tensor_A0.host_view() << std::endl;
// std::cout << "reference_D0 " << reference_D0.host_view() << std::endl;
// std::cout << "reference_D1 " << reference_D1.host_view() << std::endl;
// std::cout << "reference_D2 " << reference_D2.host_view() << std::endl;
//std::cout << "reference_D0 " << reference_D0.host_view() << std::endl;
return passed;
}
};
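// Usage sketch (illustrative only; `MyDualGemm` stands for a caller-defined DualGemm
// specialization and the problem size is arbitrary):
//
//   DualFusedGemmRun<MyDualGemm> runner;
//   bool passed = runner.run(
//       {/*m=*/128, /*n=*/64, /*k=*/576},
//       /*alpha0=*/1, /*beta0=*/1, /*alpha1=*/1, /*beta1=*/1,
//       /*batch_count=*/3, /*broadcast_b1=*/true,
//       /*is_profiling=*/false);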
////////////////////////////////////////////////////////////////////////////////