From 6023038bae346ed9b4349f4815a94fa8de0f87e7 Mon Sep 17 00:00:00 2001 From: Haicheng Wu <57973641+hwu36@users.noreply.github.com> Date: Fri, 6 May 2022 13:24:51 -0400 Subject: [PATCH] add verification of the reduction tensor (#489) Co-authored-by: Haicheng Wu --- .../device/conv2d_with_reduction_testbed.h | 166 +++++++++--------- 1 file changed, 79 insertions(+), 87 deletions(-) diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h index 3b55a9ae..d2ccc9f1 100644 --- a/test/unit/conv/device/conv2d_with_reduction_testbed.h +++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h @@ -37,6 +37,7 @@ #include "cutlass/cutlass.h" #include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/tensor_reduce.h" #include "cutlass/reduction/device/reduce_split_k.h" #include "cutlass/reduction/thread/reduction_operators.h" @@ -88,8 +89,9 @@ public: cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; - cutlass::HostTensor tensor_Reduction; + cutlass::HostTensor tensor_Reduction; cutlass::HostTensor tensor_Tensor; + cutlass::HostTensor tensor_Final_Reduction; cutlass::HostTensor tensor_D_computed; cutlass::HostTensor tensor_D_reference; @@ -115,23 +117,8 @@ public: if (dist_kind == cutlass::Distribution::Uniform) { - int scope; - int bits = cutlass::sizeof_bits::value; + int scope = 2; - if (bits <= 8) { - scope = 2; - } - else if (bits == 16) { - if (cutlass::sizeof_bits::value <= 16) { - scope = 3; - } - else { - scope = 5; - } - } - else { - scope = 8; - } cutlass::reference::host::TensorFillRandomUniform( view, seed, scope, -scope, 0); } @@ -159,8 +146,17 @@ public: tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); tensor_Reduction.resize({ - (problem_size.N * problem_size.P * problem_size.Q), - (problem_size.K - 1 + Conv2d::ThreadblockShape::kN) / Conv2d::ThreadblockShape::kN + 1, + 1, + (problem_size.N * problem_size.P * problem_size.Q - 1 + Conv2d::ThreadblockShape::kM) / Conv2d::ThreadblockShape::kM, + (problem_size.K) + }); + + tensor_Final_Reduction.resize({ + 1, + 1, + 1, + (problem_size.K) }); tensor_Tensor.resize({(problem_size.N * problem_size.P * problem_size.Q), problem_size.K}); @@ -291,52 +287,36 @@ public: EXPECT_EQ(result, cudaSuccess) << " device reference error: " << cudaGetErrorString(result); + // Final reduction over the partial reduction tensor + using Functor = cutlass::plus; + using TensorReduction = cutlass::reduction::device::TensorReduction< + ElementAccumulator, + ElementAccumulator, + LayoutC, + Functor, + 8, + ElementAccumulator + >; + + TensorReduction reduction(tensor_Reduction.extent(), 2); + + cutlass::DeviceAllocation reduction_device_workspace(reduction.workspace_size()); + + status = reduction.reduce( + tensor_Final_Reduction.device_ref(), + tensor_Reduction.device_ref(), + reduction_device_workspace.get(), + ElementAccumulator()); + + EXPECT_EQ(status, cutlass::Status::kSuccess); + EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + // + // Reference check + // + tensor_D_computed.sync_host(); - // - // Reference check - support caching results - // - - CachedTestKey cached_test_key = CreateCachedConv2dWithReductionTestKey< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementAccumulator, - ElementCompute - >( - kConvolutionalOperator, - problem_size, - alpha, - beta, - tensor_A.host_view(), - tensor_B.host_view(), - tensor_C.host_view() - ); - - // - // Look for the cached key - // - - bool cached_result_loaded = false; - CachedTestResult cached_test_result; - - std::string conv2d_result_cache_name = - std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt"; - - if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) { - - CachedTestResultListing cached_results(conv2d_result_cache_name); - - auto cached = cached_results.find(cached_test_key); - - cached_result_loaded = cached.first; - if (cached_result_loaded) { - cached_test_result = cached.second; - } - } - - if (!cached_result_loaded) { - #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED cutlass::reference::device::Conv2d< @@ -384,32 +364,44 @@ public: #endif - if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) { + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); - cached_test_result.D = TensorHash(tensor_D_reference.host_view()); + EXPECT_TRUE(passed); - CachedTestResultListing cached_results(conv2d_result_cache_name); + // + // Reference check on reduction results + // - cached_results.append(cached_test_key, cached_test_result); - cached_results.write(conv2d_result_cache_name); + tensor_Reduction.sync_host(); + tensor_Final_Reduction.sync_host(); + + // compute backwards for reduction results + cutlass::HostTensor reference_Reduction; + reference_Reduction.resize({ + 1, + 1, + 1, + (problem_size.K) + }); + + for (int k = 0; k < problem_size.K; ++k) { + ElementAccumulator reduced_value = ElementAccumulator(); + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + reduced_value += tensor_D_reference.at({n, p, q, k}); + } + } } - } // if (!cached_result_loaded) - - - uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view()); - - if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) { - passed = (tensor_D_hash == cached_test_result.D); - - EXPECT_EQ(tensor_D_hash, cached_test_result.D) - << "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n"; + reference_Reduction.at({0, 0, 0, k}) = reduced_value; } - else { - passed = cutlass::reference::host::TensorEquals( - tensor_D_computed.host_view(), - tensor_D_reference.host_view()); - } + passed = cutlass::reference::host::TensorEquals( + tensor_Final_Reduction.host_view(), + reference_Reduction.host_view() + ); EXPECT_TRUE(passed); @@ -458,13 +450,13 @@ public: << "\nB:\n" << tensor_B.host_view() << "\n" << "\nC:\n" << tensor_C.host_view() << "\n" << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" - << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; - + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n" + << "\nreduction reference:\n" << reference_Reduction.host_view() << "\n" + << "\nreduction computed:\n" << tensor_Reduction.host_view() << "\n"; } return passed; } - }; ///////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -603,9 +595,9 @@ bool TestAllConv2dWithReduction( {1, 1} // dilation (dilation_h, dilation_w) ); + // Parallel SplitK is not tested. cutlass::conv::SplitKMode split_k_modes [] = { cutlass::conv::SplitKMode::kSerial, - cutlass::conv::SplitKMode::kParallel, }; int split_k_slices[] = {