diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
index f43d1160..8ceea638 100644
--- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
+++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
@@ -43,7 +43,7 @@
 //        for (int k = 0; k < options.index_size; ++k) {
 //            int a_col = tensor_indices.at({k, 0});
 //            tensor_d_ref.at({i, b_c_d_col}) +=
-//              alpha * tensor_a.at({i, a_col}) * tensor_b.at({k, b_c_d_col}) + beta * tensor_c.at({i, b_c_d_col});
+//              alpha * tensor_a.at({i, a_col}) * tensor_b.at({k, b_c_d_col});
 //        }
 //      }
 //
@@ -229,8 +229,7 @@ using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
                                                           // the vector width of math instructions in
                                                           // epilogue too
     ElementAccumulator,                                   // <- data type of accumulator
-    ElementComputeEpilogue,                               // <- data type for alpha in linear combination function
-    cutlass::epilogue::thread::ScaleType::Nothing>;       // <- C
+    ElementComputeEpilogue>;                              // <- data type for alpha in linear combination function
 
 // Number of pipelines you want to use
 constexpr int NumStages = 5;
@@ -301,8 +300,12 @@ int run(Options &options) {
       ElementInputA(-8),
       0);  // <- Fill matrix B on host with uniform-distribution random data
 
-  cutlass::reference::host::TensorFill(
-      tensor_c.host_view());  // <- Fill matrix C on host with zeros
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_c.host_view(),
+      1,
+      ElementOutput(7),
+      ElementOutput(-8),
+      0);  // <- Fill matrix C on host with uniform-distribution random data
 
   cutlass::reference::host::TensorFill(
     tensor_d_scattered.host_view());  // <- fill matrix D on host with zeros
@@ -387,8 +390,10 @@ int run(Options &options) {
         for (int k = 0; k < options.index_size; ++k) {
             int a_col = tensor_indices.at({k, 0});
             tensor_d_ref.at({i, b_c_d_col}) +=
-              alpha * tensor_a.at({i, a_col}) * tensor_b.at({k, b_c_d_col}) + beta * tensor_c.at({i, b_c_d_col});
+              alpha * tensor_a.at({i, a_col}) * tensor_b.at({k, b_c_d_col});
         }
+       
+        tensor_d_ref.at({i, b_c_d_col}) += (beta * tensor_c.at({i, b_c_d_col}));
       }
     }