diff --git a/CHANGELOG.md b/CHANGELOG.md
index da720d8b..1d50be45 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,8 +5,8 @@
 ## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26)
   * Tensor reductions
     * _m_-to-_n_ reductions of tensors with affine layout
-    * [Specializations](/test/unit/reduction/tensor_reduce_contiguous.cu) for reductions including contiguous dimension
-    * [Specializations](/test/unit/reduction/tensor_reduce_strided.cu) for reductions excluding contiguous dimension
+    * [Specializations](/test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension
+    * [Specializations](/test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension
     * Custom reduction functors such as `cutlass::logical_and`
     * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31)
   * Optimizations for 3-D convolution
diff --git a/README.md b/README.md
index 72d30752..bf2d5c92 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ supported at each level of the execution model hierarchy.
 
 # What's New in CUTLASS 2.5
 CUTLASS 2.5 is a minor update to CUTLASS adding:
-- [Tensor reductions](/test/unit/reduction/tensor_reduce_contiguous.cu)
+- [Tensor reductions](/test/unit/reduction/device/tensor_reduce_contiguous.cu)
 - [Optimizations for 3-D convolution](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h)
 - [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md)
 - See the [CHANGELOG](CHANGELOG.md) for more details
diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt
index d53bc0c1..c489f50f 100644
--- a/test/unit/reduction/CMakeLists.txt
+++ b/test/unit/reduction/CMakeLists.txt
@@ -22,11 +22,14 @@
 
 add_subdirectory(thread)
 add_subdirectory(kernel)
+add_subdirectory(device)
+
 add_custom_target(
   cutlass_test_unit_reduction
   DEPENDS
   cutlass_test_unit_reduction_thread
   cutlass_test_unit_reduction_kernel
+  cutlass_test_unit_reduction_device
   )
 
 add_custom_target(
@@ -34,4 +37,5 @@ add_custom_target(
   DEPENDS
   test_unit_reduction_thread
   test_unit_reduction_kernel
+  test_unit_reduction_device
   )