diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
index a6d2a8a1..fcc484ea 100644
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
@@ -653,7 +653,7 @@ struct B2bGemm {
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
     int lane_idx = threadIdx.x % 32;
 
     // Construct iterators to accumulator scale/bias vector
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
index 5fe51200..a640fc60 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
@@ -331,7 +331,7 @@ class gen_Kernel:
 
             operator_code += "    " + helper.var_idx("FusedAddBiasEpilogue", i ) + helper.var_idx(" epilogue_", i ) + ";\n"
 
-        operator_code += "    " + "int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);\n"
+        operator_code += "    " + "int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);\n"
         operator_code += "    " + "int lane_idx = threadIdx.x % 32;\n"
 
         for i in range (self.b2bnum - 1):
diff --git a/examples/45_dual_gemm/kernel/dual_gemm.h b/examples/45_dual_gemm/kernel/dual_gemm.h
index f0ad97db..bd3c438f 100644
--- a/examples/45_dual_gemm/kernel/dual_gemm.h
+++ b/examples/45_dual_gemm/kernel/dual_gemm.h
@@ -364,7 +364,7 @@ struct DualGemm {
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
     int lane_idx = threadIdx.x % 32;
 
     //
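
Note (not part of the patch): the first argument of __shfl_sync is a 32-bit lane mask, and 0x1f names only lanes 0 through 4, while the comment above each changed line says the intent is to broadcast lane 0's warp_id to the entire warp. The full-warp mask 0xffffffff makes all 32 lanes participate, which is what a warp-uniform broadcast requires; a mask that omits participating lanes is undefined behavior per the CUDA shuffle documentation. Below is a minimal standalone sketch of the corrected pattern; the kernel name broadcast_warp_idx and the launch configuration are illustrative assumptions, not code from this repository.

    #include <cstdio>

    __global__ void broadcast_warp_idx() {
      // Use the full-warp mask so every lane of the warp takes part in the
      // shuffle, then broadcast the warp index computed by lane 0.
      int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
      int lane_idx = threadIdx.x % 32;

      if (lane_idx == 0) {
        printf("warp %d sees warp_idx = %d\n", threadIdx.x / 32, warp_idx);
      }
    }

    int main() {
      broadcast_warp_idx<<<1, 64>>>();  // one block of two warps
      cudaDeviceSynchronize();
      return 0;
    }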