518 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			518 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | /*************************************************************************************************** | ||
|  |  * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved. | ||
|  |  * | ||
|  |  * Redistribution and use in source and binary forms, with or without modification, are permitted | ||
|  |  * provided that the following conditions are met: | ||
|  |  *     * Redistributions of source code must retain the above copyright notice, this list of | ||
|  |  *       conditions and the following disclaimer. | ||
|  |  *     * Redistributions in binary form must reproduce the above copyright notice, this list of | ||
|  |  *       conditions and the following disclaimer in the documentation and/or other materials | ||
|  |  *       provided with the distribution. | ||
|  |  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used | ||
|  |  *       to endorse or promote products derived from this software without specific prior written | ||
|  |  *       permission. | ||
|  |  * | ||
|  |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR | ||
|  |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | ||
|  |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE | ||
|  |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | ||
|  |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | ||
|  |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
|  |  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
|  |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|  |  * | ||
|  |  **************************************************************************************************/ | ||
|  | /*! \file | ||
|  |     \brief Tests for TensorReduce family of device-wide operators | ||
|  | */ | ||
|  | 
 | ||
|  | #include <iostream> | ||
|  | #include <limits> | ||
|  | 
 | ||
|  | #include "../../common/cutlass_unit_test.h" | ||
|  | 
 | ||
|  | #include "cutlass/cutlass.h" | ||
|  | #include "cutlass/complex.h" | ||
|  | #include "cutlass/reduction/thread/reduction_operators.h" | ||
|  | #include "cutlass/reduction/device/tensor_reduce.h" | ||
|  | 
 | ||
|  | #include "cutlass/functional.h" | ||
|  | #include "cutlass/layout/tensor.h" | ||
|  | 
 | ||
|  | #include "cutlass/util/host_tensor.h" | ||
|  | #include "cutlass/util/reference/host/gemm.h" | ||
|  | #include "cutlass/util/reference/host/tensor_compare.h" | ||
|  | #include "cutlass/util/reference/host/tensor_copy.h" | ||
|  | #include "cutlass/util/reference/host/tensor_fill.h" | ||
|  | #include "cutlass/util/reference/device/tensor_fill.h" | ||
|  | #include "cutlass/util/reference/host/tensor_norm.h" | ||
|  | #include "cutlass/util/tensor_view_io.h" | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// This reduces the W dimension, transforming an NHWC tensor into NHWC with W=1. | ||
|  | template < | ||
|  |   typename TensorReduction,  | ||
|  |   typename ElementCompute = typename TensorReduction::ElementCompute | ||
|  | > | ||
|  | bool TestAllReduction_NHWC_reduce_w(ElementCompute reduction_identity = ElementCompute()) { | ||
|  | 
 | ||
|  |   using Layout = typename TensorReduction::Layout; | ||
|  |   using ElementOutput = typename TensorReduction::ElementOutput; | ||
|  |   using ElementSource = typename TensorReduction::ElementSource; | ||
|  | 
 | ||
|  |   int const kV = TensorReduction::kVectorLength; | ||
|  | 
 | ||
|  |   int const N_indices[] = {1, 2, 5, 10}; | ||
|  |   int const H_indices[] = {1, 3, 9 }; | ||
|  |   int const W_indices[] = {1, 5, 19, 40, 224}; | ||
|  |   int const C_indices[] = { | ||
|  |     kV,  | ||
|  |     2 * kV,  | ||
|  |     5 * kV,  | ||
|  |     9 * kV,  | ||
|  |     17 * kV,  | ||
|  |     39 * kV,  | ||
|  |     257 * kV,  | ||
|  |     kV * 760 | ||
|  |   }; | ||
|  | 
 | ||
|  |   using Element = int; | ||
|  | 
 | ||
|  |   for (int N : N_indices) { | ||
|  |     for (int H : H_indices) { | ||
|  |       for (int W : W_indices) { | ||
|  |         for (int C : C_indices) { | ||
|  | 
 | ||
|  |           cutlass::HostTensor<ElementSource, Layout> src_tensor({N, H, W, C}); | ||
|  |           cutlass::HostTensor<ElementOutput, Layout> dst_tensor({N, H, 1, C}); | ||
|  | 
 | ||
|  |           cutlass::reference::host::TensorFillRandomUniform( | ||
|  |             src_tensor.host_view(), 17, 10, -10, 0); | ||
|  | 
 | ||
|  |           cutlass::reference::host::BlockFillSequential( | ||
|  |             dst_tensor.host_data(), dst_tensor.capacity()); | ||
|  | 
 | ||
|  |           dst_tensor.sync_device(); | ||
|  |           src_tensor.sync_device(); | ||
|  | 
 | ||
|  |           // Execute a tensor reduction over rank 2 (the 'W' dimension is reduced; NHWC => NHC) | ||
|  |           TensorReduction reduction(src_tensor.extent(), 2); | ||
|  | 
 | ||
|  |           cutlass::DeviceAllocation<uint8_t> device_workspace(reduction.workspace_size()); | ||
|  | 
 | ||
|  |           cutlass::Status status = reduction.reduce( | ||
|  |             dst_tensor.device_ref(), | ||
|  |             src_tensor.device_ref(), | ||
|  |             device_workspace.get(), | ||
|  |             reduction_identity | ||
|  |           ); | ||
|  | 
 | ||
|  |           EXPECT_EQ(status, cutlass::Status::kSuccess); | ||
|  |           EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); | ||
|  |           // Reference check | ||
|  |           dst_tensor.sync_host(); | ||
|  | 
 | ||
|  |           typename TensorReduction::ReductionOp reduction_op; | ||
|  | 
 | ||
|  |           for (int n = 0; n < src_tensor.extent().n(); ++n) { | ||
|  |             for (int h = 0; h < src_tensor.extent().h(); ++h) { | ||
|  |               for (int c = 0; c < src_tensor.extent().c(); ++c) { | ||
|  | 
 | ||
|  |                 ElementCompute w_accum = reduction_identity; | ||
|  | 
 | ||
|  |                 for (int w = 0; w < src_tensor.extent().w(); ++w) { | ||
|  |                   w_accum = reduction_op(w_accum, ElementCompute(src_tensor.at({n, h, w, c}))); | ||
|  |                 } | ||
|  | 
 | ||
|  |                 ElementCompute got = ElementCompute(dst_tensor.at({n, h, 0, c})); | ||
|  | 
 | ||
|  |                 bool equal = (w_accum == got); | ||
|  | 
 | ||
|  |                 EXPECT_TRUE(equal); | ||
|  |                 if (!equal) { | ||
|  | 
 | ||
|  |                   std::cerr  | ||
|  |                     << "Error at location (" << n << ", " << h << ", 0, " << c << ")" << std::endl; | ||
|  | 
 | ||
|  |                   std::cerr  | ||
|  |                     << "  expected: " << w_accum << std::endl | ||
|  |                     << "       got: " << got << std::endl; | ||
|  | 
 | ||
|  |                   std::cerr  | ||
|  |                     << "Problem: " << src_tensor.extent() << " -> "  | ||
|  |                     << dst_tensor.extent() << std::endl; | ||
|  | 
 | ||
|  |                   std::cerr  | ||
|  |                     << "   Grid: " << reduction.reduction_strided.grid_shape  | ||
|  |                     << "\n  Block: " << reduction.reduction_strided.threadblock_shape << std::endl | ||
|  |                     << "  Final: " << reduction.reduction_strided.grid_final  | ||
|  |                     << "\n  Block: " << reduction.reduction_strided.threadblock_final << "\n"; | ||
|  | 
 | ||
|  |                   return false; | ||
|  |                 } | ||
|  |               } | ||
|  |             } | ||
|  |           } | ||
|  |         } | ||
|  |       } | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x8_f16x8) { | ||
|  | 
 | ||
|  |   int const kV = 8; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = cutlass::half_t; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::plus<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x2_f16x2) { | ||
|  | 
 | ||
|  |   int const kV = 2; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = cutlass::half_t; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::plus<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x1_f16x1) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = cutlass::half_t; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::plus<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_reduce_w_s32x4) { | ||
|  | 
 | ||
|  |   int const kV = 4; | ||
|  |   using Element = int; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::plus<Element>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     Element, | ||
|  |     Element, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     Element | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_reduce_w_cf32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = cutlass::complex<float>; | ||
|  |   using ElementSource = cutlass::complex<float>; | ||
|  |   using ElementCompute = cutlass::complex<float>; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::plus<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_maximum_w_cf32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = float; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::maximum<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>( -std::numeric_limits<float>::max() ));           | ||
|  | } | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_minimum_w_cf32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = float; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::minimum<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(std::numeric_limits<float>::max()));           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_XOR_w_u32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = int; | ||
|  |   using ElementSource = int; | ||
|  |   using ElementCompute = int; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::bit_xor<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_AND_w_s32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = unsigned; | ||
|  |   using ElementSource = unsigned; | ||
|  |   using ElementCompute = unsigned; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::bit_and<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(0xffffffff)); | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_OR_w_u32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = int; | ||
|  |   using ElementSource = int; | ||
|  |   using ElementCompute = int; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::bit_or<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_ANY_w_s32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = int; | ||
|  |   using ElementSource = int; | ||
|  |   using ElementCompute = int; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::logical_or<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(0)));           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_ALL_w_s32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = int; | ||
|  |   using ElementSource = int; | ||
|  |   using ElementCompute = int; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::logical_and<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(1)));           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_ANY_w_f32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = float; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::logical_or<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(0)));           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
|  | 
 | ||
|  | /// Test tensor reduction from NHWC to NHC | ||
|  | TEST(Reduction_TensorReduce, nhwc_ALL_w_f32) { | ||
|  | 
 | ||
|  |   int const kV = 1; | ||
|  |   using ElementOutput = float; | ||
|  |   using ElementSource = float; | ||
|  |   using ElementCompute = float; | ||
|  |   using Layout = cutlass::layout::TensorNHWC; | ||
|  | 
 | ||
|  |   // Define the functor | ||
|  |   using Functor = cutlass::logical_and<ElementCompute>; | ||
|  | 
 | ||
|  |   using TensorReduction = cutlass::reduction::device::TensorReduction< | ||
|  |     ElementOutput, | ||
|  |     ElementSource, | ||
|  |     Layout, | ||
|  |     Functor, | ||
|  |     kV, | ||
|  |     ElementCompute | ||
|  |   >; | ||
|  | 
 | ||
|  |   EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(1)));           | ||
|  | } | ||
|  | 
 | ||
|  | ///////////////////////////////////////////////////////////////////////////////////////////////// |