// cutlass/tools/test/unit/gemm/gemm_shared_mem_layouts.cu
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <cutlass_unit_tests.h>
#include <tools/util/host_tensor.h>
#include <tools/util/tensor_view_io.h>
#include <cutlass/gemm/gemm.h>
#include <cutlass/shape.h>
#include <cutlass/gemm/sgemm_traits.h>
#include <cutlass/gemm/dgemm_traits.h>
#include <cutlass/gemm/hgemm_traits.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
// M/N/K struct.
// Plain-old-data descriptor carrying the GEMM problem dimensions (M, N, K)
// that the global load stream Params expect at initialization time.
struct GemmDesc {
  // Problem dimensions: rows of C, columns of C, reduction depth.
  int m, n, k;
  /// Builds a descriptor from the three problem dimensions.
  inline __host__ __device__ GemmDesc(int rows, int cols, int depth) {
    m = rows;
    n = cols;
    k = depth;
  }
};
/// Simple test to load from global memory and store to shared memory
// Loading from global memory and storing to shared memory for A
/// Loads the A operand tile from global memory into shared memory through the
/// GEMM global load stream, then dumps the raw shared-memory image back to
/// global memory so the host can inspect the layout (including skew padding).
/// Intended for a single thread block; thread 0 serializes the write-out of
/// (M + skew) * K scalars.
template <typename Traits>
__global__ void gemm_load_global_store_shared_a(
typename Traits::GlobalLoadStreamA::Scalar *output,
typename Traits::GlobalLoadStreamA::Scalar const *input,
int M,
int N,
int K,
int ldm,
int skew) {
  // Shared-memory backing store used by the GEMM main loop.
  __shared__ typename Traits::SharedStorage shared_storage;
  // Construct the global->shared load stream for operand A.
  typedef typename Traits::GlobalLoadStreamA LoadStream;
  typename LoadStream::Params params;
  GemmDesc problem(M, N, K);
  params.initialize(problem, input, ldm);
  LoadStream load_stream(params,
                         shared_storage.main_loop.stream_a.global,
                         M, N, K,
                         cutlass::make_Coord(0, 0, 0));
  // Issue the global loads, then commit the fragments to shared memory.
  load_stream.copy();
  load_stream.commit();
  // Make every thread's shared-memory stores visible before reading back.
  __syncthreads();
  // A single thread copies the shared-memory image (with skew) to output.
  if (threadIdx.x == 0) {
    int const count = (M + skew) * K;
    for (int idx = 0; idx < count; ++idx) {
      output[idx] = shared_storage.main_loop.stream_a.shared.scalars[idx];
    }
  }
}
// Loading from global memory and storing to shared memory for B
/// Loads the B operand tile from global memory into shared memory through the
/// GEMM global load stream, then dumps the raw shared-memory image back to
/// global memory so the host can inspect the layout (including skew padding).
/// Intended for a single thread block; thread 0 serializes the write-out of
/// (N + skew) * K scalars.
template <typename Traits>
__global__ void gemm_load_global_store_shared_b(
typename Traits::GlobalLoadStreamB::Scalar *output,
typename Traits::GlobalLoadStreamB::Scalar const *input,
int M,
int N,
int K,
int ldm,
int skew) {
  // Shared-memory backing store used by the GEMM main loop.
  __shared__ typename Traits::SharedStorage shared_storage;
  // Construct the global->shared load stream for operand B.
  typedef typename Traits::GlobalLoadStreamB LoadStream;
  typename LoadStream::Params params;
  GemmDesc problem(M, N, K);
  params.initialize(problem, input, ldm);
  LoadStream load_stream(params,
                         shared_storage.main_loop.stream_b.global,
                         M, N, K,
                         cutlass::make_Coord(0, 0, 0));
  // Issue the global loads, then commit the fragments to shared memory.
  load_stream.copy();
  load_stream.commit();
  // Make every thread's shared-memory stores visible before reading back.
  __syncthreads();
  // A single thread copies the shared-memory image (with skew) to output.
  if (threadIdx.x == 0) {
    int const count = (N + skew) * K;
    for (int idx = 0; idx < count; ++idx) {
      output[idx] = shared_storage.main_loop.stream_b.shared.scalars[idx];
    }
  }
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Contiguous (column-major) A, float, no skew: the shared-memory image must
// match the input tile bit-for-bit.
TEST(GemmSharedMemLayout, A_float_contiguous) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::SgemmTraits<cutlass::MatrixLayout::kColumnMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      SgemmTraits;
  cutlass::HostTensor<float> input;
  cutlass::HostTensor<float> output;
  int skew = 0;
  // A tile is M x K; output mirrors it exactly since skew == 0.
  input.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                      cutlass::MatrixLayout::kColumnMajor);
  output.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                       cutlass::MatrixLayout::kColumnMajor);
  input.fill_linear(cutlass::make_Coord(1, 1, ThreadBlockTile::kW, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_a< SgemmTraits ><<<
      dim3(1,1,1),
      dim3(SgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      M,     // ldm: leading dimension of column-major A
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Crosswise (row-major) A, float, with a 4-element skew: the transposed
// shared-memory image (with padding) must match the padded reference tensor.
TEST(GemmSharedMemLayout, A_float_crosswise) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::SgemmTraits<cutlass::MatrixLayout::kRowMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      SgemmTraits;
  cutlass::HostTensor<float> input;
  cutlass::HostTensor<float> output;
  int skew = 4;
  // Output carries skew rows of padding per column of the shared tile.
  input.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                      cutlass::MatrixLayout::kRowMajor);
  output.resize_matrix(ThreadBlockTile::kW + skew, ThreadBlockTile::kD,
                       cutlass::MatrixLayout::kColumnMajor);
  input.fill_linear(cutlass::make_Coord(1, ThreadBlockTile::kD, 1, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_a< SgemmTraits ><<<
      dim3(1,1,1),
      dim3(SgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      K,     // ldm: leading dimension of row-major A
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Contiguous (row-major) B, float, no skew: the shared-memory image must
// match the input tile bit-for-bit.
TEST(GemmSharedMemLayout, B_float_contiguous) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::SgemmTraits<cutlass::MatrixLayout::kColumnMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      SgemmTraits;
  cutlass::HostTensor<float> input;
  cutlass::HostTensor<float> output;
  int skew = 0;
  // B tile is K x N; output mirrors it exactly since skew == 0.
  input.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                      cutlass::MatrixLayout::kRowMajor);
  output.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                       cutlass::MatrixLayout::kRowMajor);
  input.fill_linear(cutlass::make_Coord(1, ThreadBlockTile::kH, 1, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_b< SgemmTraits ><<<
      dim3(1,1,1),
      dim3(SgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      N,     // ldm: leading dimension of row-major B
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Crosswise (column-major) B, float, with a 4-element skew: the transposed
// shared-memory image (with padding) must match the padded reference tensor.
TEST(GemmSharedMemLayout, B_float_crosswise) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::SgemmTraits<cutlass::MatrixLayout::kColumnMajor,
                                     cutlass::MatrixLayout::kColumnMajor, ThreadBlockTile >
      SgemmTraits;
  cutlass::HostTensor<float> input;
  cutlass::HostTensor<float> output;
  int skew = 4;
  // Output carries skew elements of padding per row of the shared tile.
  input.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                      cutlass::MatrixLayout::kColumnMajor);
  output.resize_matrix(ThreadBlockTile::kD + skew, ThreadBlockTile::kH,
                       cutlass::MatrixLayout::kRowMajor);
  input.fill_linear(cutlass::make_Coord(1, 1, ThreadBlockTile::kD, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_b< SgemmTraits ><<<
      dim3(1,1,1),
      dim3(SgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      K,     // ldm: leading dimension of column-major B
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Contiguous (column-major) A, double, no skew: the shared-memory image must
// match the input tile bit-for-bit.
TEST(GemmSharedMemLayout, A_double_contiguous) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::DgemmTraits<cutlass::MatrixLayout::kColumnMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      DgemmTraits;
  cutlass::HostTensor<double> input;
  cutlass::HostTensor<double> output;
  int skew = 0;
  // A tile is M x K; output mirrors it exactly since skew == 0.
  input.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                      cutlass::MatrixLayout::kColumnMajor);
  output.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                       cutlass::MatrixLayout::kColumnMajor);
  input.fill_linear(cutlass::make_Coord(1, 1, ThreadBlockTile::kW, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_a< DgemmTraits ><<<
      dim3(1,1,1),
      dim3(DgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      M,     // ldm: leading dimension of column-major A
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Crosswise (row-major) A, double, with a 2-element skew: the transposed
// shared-memory image (with padding) must match the padded reference tensor.
TEST(GemmSharedMemLayout, A_double_crosswise) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::DgemmTraits<cutlass::MatrixLayout::kRowMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      DgemmTraits;
  cutlass::HostTensor<double> input;
  cutlass::HostTensor<double> output;
  int skew = 2;
  // Output carries skew rows of padding per column of the shared tile.
  input.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                      cutlass::MatrixLayout::kRowMajor);
  output.resize_matrix(ThreadBlockTile::kW + skew, ThreadBlockTile::kD,
                       cutlass::MatrixLayout::kColumnMajor);
  input.fill_linear(cutlass::make_Coord(1, ThreadBlockTile::kD, 1, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_a< DgemmTraits ><<<
      dim3(1,1,1),
      dim3(DgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      K,     // ldm: leading dimension of row-major A
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Contiguous (row-major) B, double, no skew: the shared-memory image must
// match the input tile bit-for-bit.
TEST(GemmSharedMemLayout, B_double_contiguous) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::DgemmTraits<cutlass::MatrixLayout::kColumnMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      DgemmTraits;
  cutlass::HostTensor<double> input;
  cutlass::HostTensor<double> output;
  int skew = 0;
  // B tile is K x N; output mirrors it exactly since skew == 0.
  input.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                      cutlass::MatrixLayout::kRowMajor);
  output.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                       cutlass::MatrixLayout::kRowMajor);
  input.fill_linear(cutlass::make_Coord(1, ThreadBlockTile::kH, 1, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_b< DgemmTraits ><<<
      dim3(1,1,1),
      dim3(DgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      N,     // ldm: leading dimension of row-major B
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Crosswise (column-major) B, double, with a 2-element skew: the transposed
// shared-memory image (with padding) must match the padded reference tensor.
TEST(GemmSharedMemLayout, B_double_crosswise) {
  static int const M = 64;
  static int const N = 64;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::DgemmTraits<cutlass::MatrixLayout::kColumnMajor,
                                     cutlass::MatrixLayout::kColumnMajor, ThreadBlockTile >
      DgemmTraits;
  cutlass::HostTensor<double> input;
  cutlass::HostTensor<double> output;
  int skew = 2;
  // Output carries skew elements of padding per row of the shared tile.
  input.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                      cutlass::MatrixLayout::kColumnMajor);
  output.resize_matrix(ThreadBlockTile::kD + skew, ThreadBlockTile::kH,
                       cutlass::MatrixLayout::kRowMajor);
  input.fill_linear(cutlass::make_Coord(1, 1, ThreadBlockTile::kD, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_b< DgemmTraits ><<<
      dim3(1,1,1),
      dim3(DgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      K,     // ldm: leading dimension of column-major B
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Crosswise (row-major) A, half, with an 8-element skew and a larger 128x128
// tile: the transposed shared-memory image (with padding) must match the
// padded reference tensor.
TEST(GemmSharedMemLayout, A_half_crosswise) {
  static int const M = 128;
  static int const N = 128;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::HgemmTraits<cutlass::MatrixLayout::kRowMajor, cutlass::MatrixLayout::kRowMajor, ThreadBlockTile >
      HgemmTraits;
  cutlass::HostTensor<cutlass::half_t> input;
  cutlass::HostTensor<cutlass::half_t> output;
  int skew = 8;
  // Output carries skew rows of padding per column of the shared tile.
  input.resize_matrix(ThreadBlockTile::kW, ThreadBlockTile::kD,
                      cutlass::MatrixLayout::kRowMajor);
  output.resize_matrix(ThreadBlockTile::kW + skew, ThreadBlockTile::kD,
                       cutlass::MatrixLayout::kColumnMajor);
  input.fill_linear(cutlass::make_Coord(1, ThreadBlockTile::kD, 1, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_a< HgemmTraits ><<<
      dim3(1,1,1),
      dim3(HgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      K,     // ldm: leading dimension of row-major A
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Crosswise (column-major) B, half, with an 8-element skew and a larger
// 128x128 tile: the transposed shared-memory image (with padding) must match
// the padded reference tensor.
TEST(GemmSharedMemLayout, B_half_crosswise) {
  static int const M = 128;
  static int const N = 128;
  static int const K = 8;
  typedef cutlass::Shape<K, N, M> ThreadBlockTile;
  typedef cutlass::gemm::HgemmTraits<cutlass::MatrixLayout::kColumnMajor,
                                     cutlass::MatrixLayout::kColumnMajor, ThreadBlockTile >
      HgemmTraits;
  cutlass::HostTensor<cutlass::half_t> input;
  cutlass::HostTensor<cutlass::half_t> output;
  int skew = 8;
  // Output carries skew elements of padding per row of the shared tile.
  input.resize_matrix(ThreadBlockTile::kD, ThreadBlockTile::kH,
                      cutlass::MatrixLayout::kColumnMajor);
  output.resize_matrix(ThreadBlockTile::kD + skew, ThreadBlockTile::kH,
                       cutlass::MatrixLayout::kRowMajor);
  input.fill_linear(cutlass::make_Coord(1, 1, ThreadBlockTile::kD, 1));
  output.fill(0);
  test::gemm_load_global_store_shared_b< HgemmTraits ><<<
      dim3(1,1,1),
      dim3(HgemmTraits::kThreads, 1)
  >>>(
      output.device_data(),
      input.device_data(),
      M,
      N,
      K,
      K,     // ldm: leading dimension of column-major B
      skew
  );
  // Catch launch-configuration errors separately from execution errors.
  cudaError_t launch_result = cudaGetLastError();
  ASSERT_EQ(launch_result, cudaSuccess) << "\nCUDA kernel launch error: "
      << cudaGetErrorString(launch_result) << "\n";
  cudaError_t result = cudaDeviceSynchronize();
  ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
      << "\n";
  output.sync_host();
  EXPECT_TRUE(input.bit_equals(output));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
}