add unit test for non int4 load
This commit is contained in:
parent
bb35a3ba6f
commit
f4b0a33633
@ -615,6 +615,11 @@ struct DefaultConv2dFprop <
|
|||||||
using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
|
using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
|
||||||
using MmaPolicy = typename MmaCore::MmaPolicy;
|
using MmaPolicy = typename MmaCore::MmaPolicy;
|
||||||
|
|
||||||
|
static cutlass::arch::CacheOperation::Kind const CacheOpB =
|
||||||
|
((sizeof_bits<ElementA>::value * AlignmentB) == 128)
|
||||||
|
? cutlass::arch::CacheOperation::Global
|
||||||
|
: cutlass::arch::CacheOperation::Always;
|
||||||
|
|
||||||
// Define the Mma
|
// Define the Mma
|
||||||
using Mma = threadblock::ImplicitGemmMultistage<
|
using Mma = threadblock::ImplicitGemmMultistage<
|
||||||
ThreadblockShape,
|
ThreadblockShape,
|
||||||
@ -623,7 +628,7 @@ struct DefaultConv2dFprop <
|
|||||||
arch::CacheOperation::Always,
|
arch::CacheOperation::Always,
|
||||||
IteratorB,
|
IteratorB,
|
||||||
SmemIteratorB,
|
SmemIteratorB,
|
||||||
arch::CacheOperation::Always,
|
CacheOpB,
|
||||||
MmaPolicy,
|
MmaPolicy,
|
||||||
Stages
|
Stages
|
||||||
>;
|
>;
|
||||||
|
@ -341,6 +341,8 @@ public:
|
|||||||
// Parameters structure
|
// Parameters structure
|
||||||
//
|
//
|
||||||
|
|
||||||
|
static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
|
||||||
|
|
||||||
struct Params {
|
struct Params {
|
||||||
|
|
||||||
Layout layout;
|
Layout layout;
|
||||||
|
@ -176,6 +176,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CUTLASS_PRAGMA_UNROLL
|
||||||
for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
|
for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
|
||||||
clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx);
|
clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx);
|
||||||
}
|
}
|
||||||
@ -212,7 +213,6 @@ public:
|
|||||||
#else
|
#else
|
||||||
if (clear) {
|
if (clear) {
|
||||||
predicates_[index] = 0;
|
predicates_[index] = 0;
|
||||||
predicates_[index] = 0;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -247,6 +247,7 @@ public:
|
|||||||
filter_c_ += params_.filter_c_delta;
|
filter_c_ += params_.filter_c_delta;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CUTLASS_PRAGMA_UNROLL
|
||||||
for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
|
for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
|
||||||
clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx);
|
clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx);
|
||||||
}
|
}
|
||||||
|
@ -117,5 +117,89 @@ TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_ten
|
|||||||
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2,
|
||||||
|
128x128_64x3_64x64x64) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::half_t;
|
||||||
|
using ElementB = cutlass::half_t;
|
||||||
|
using ElementC = cutlass::half_t;
|
||||||
|
using ElementAccumulator = cutlass::half_t;
|
||||||
|
using ElementCompute = cutlass::half_t;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm80,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 64>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 64>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 16>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
3,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4,
|
||||||
|
128x128_64x3_64x64x64) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::half_t;
|
||||||
|
using ElementB = cutlass::half_t;
|
||||||
|
using ElementC = cutlass::half_t;
|
||||||
|
using ElementAccumulator = cutlass::half_t;
|
||||||
|
using ElementCompute = cutlass::half_t;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm80,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 64>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 64>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 16>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
3,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
|
#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
|
||||||
|
@ -117,5 +117,134 @@ TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_ten
|
|||||||
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align1,
|
||||||
|
128x128_32x2_64x64x32) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::half_t;
|
||||||
|
using ElementB = cutlass::half_t;
|
||||||
|
using ElementC = float;
|
||||||
|
using ElementAccumulator = float;
|
||||||
|
using ElementCompute = float;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm75,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 32>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 32>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
2,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
1,
|
||||||
|
1
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align2,
|
||||||
|
128x128_32x2_64x64x32) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::half_t;
|
||||||
|
using ElementB = cutlass::half_t;
|
||||||
|
using ElementC = float;
|
||||||
|
using ElementAccumulator = float;
|
||||||
|
using ElementCompute = float;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm75,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 32>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 32>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
2,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4,
|
||||||
|
128x128_32x2_64x64x32) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::half_t;
|
||||||
|
using ElementB = cutlass::half_t;
|
||||||
|
using ElementC = float;
|
||||||
|
using ElementAccumulator = float;
|
||||||
|
using ElementCompute = float;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm75,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 32>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 32>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
2,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED
|
#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED
|
||||||
|
@ -77,5 +77,93 @@ TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_te
|
|||||||
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1,
|
||||||
|
128x128_32x3_64x64x32) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::tfloat32_t;
|
||||||
|
using ElementB = cutlass::tfloat32_t;
|
||||||
|
using ElementC = float;
|
||||||
|
using ElementAccumulator = float;
|
||||||
|
using ElementCompute = float;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm80,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 16>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 16>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
3,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
1,
|
||||||
|
1
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2,
|
||||||
|
128x128_32x3_64x64x32) {
|
||||||
|
|
||||||
|
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
|
||||||
|
using ElementA = cutlass::tfloat32_t;
|
||||||
|
using ElementB = cutlass::tfloat32_t;
|
||||||
|
using ElementC = float;
|
||||||
|
using ElementAccumulator = float;
|
||||||
|
using ElementCompute = float;
|
||||||
|
|
||||||
|
/// Device-level Conv2d instance
|
||||||
|
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
|
||||||
|
ElementA, cutlass::layout::TensorNHWC,
|
||||||
|
ElementB, cutlass::layout::TensorNHWC,
|
||||||
|
ElementC, cutlass::layout::TensorNHWC,
|
||||||
|
ElementAccumulator,
|
||||||
|
cutlass::arch::OpClassTensorOp,
|
||||||
|
cutlass::arch::Sm80,
|
||||||
|
cutlass::gemm::GemmShape<128, 128, 16>,
|
||||||
|
cutlass::gemm::GemmShape<64, 64, 16>,
|
||||||
|
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||||
|
cutlass::epilogue::thread::LinearCombination<
|
||||||
|
ElementC,
|
||||||
|
128 / cutlass::sizeof_bits<ElementC>::value,
|
||||||
|
ElementAccumulator,
|
||||||
|
ElementCompute
|
||||||
|
>,
|
||||||
|
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
|
||||||
|
3,
|
||||||
|
cutlass::arch::OpMultiplyAdd,
|
||||||
|
cutlass::conv::IteratorAlgorithm::kOptimized,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
>::Kernel;
|
||||||
|
|
||||||
|
using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||||
|
|
||||||
|
|
||||||
|
/// Run all unit test sizes with device-level Conv2d instance
|
||||||
|
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
|
#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
|
||||||
|
Loading…
Reference in New Issue
Block a user