From cf54274c2c1391f480728c57391b617ad170cdb2 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Mon, 27 Oct 2025 09:55:34 -0700 Subject: [PATCH 1/8] Fix a bug in hierarchical parallelism implementation --- llvm/lib/SYCLLowerIR/LowerWGScope.cpp | 25 ++++++++++++++++++++- sycl/test-e2e/HierPar/hier_par_indirect.cpp | 24 ++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp index aa55c6cbe4650..01d276ddaf662 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp @@ -214,6 +214,22 @@ static bool hasCallToAFuncWithWGMetadata(Function &F) { return false; } +// Recursively searches for a call to a function with parallel_for_work_item +// metadata inside F. +static bool hasCallToAFuncWithPFWIMetadata(Function &F) { + for (auto &BB : F) + for (auto &I : BB) { + if (isCallToAFuncMarkedWithMD(&I, PFWI_MD)) + return true; + const CallInst *Call = dyn_cast(&I); + Function *F = dyn_cast_or_null(Call ? Call->getCalledFunction() + : nullptr); + if (F && hasCallToAFuncWithPFWIMetadata(*F)) + return true; + } + return false; +} + // Checks if this is a call to parallel_for_work_item. static bool isPFWICall(const Instruction *I) { return isCallToAFuncMarkedWithMD(I, PFWI_MD); @@ -835,7 +851,14 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, } continue; } - if (!mayHaveSideEffects(I)) + // In addition to an instruction not having side effects, we end the range + // if the instruction is a call that contains, possibly several layers + // down the stack, a call to a parallel_for_work_item. Such calls should + // not be subject to lowering since they must be executed by every work + // item. + const CallInst *Call = dyn_cast(I); + if (!mayHaveSideEffects(I) || + (Call && hasCallToAFuncWithPFWIMetadata(*Call->getCalledFunction()))) continue; LLVM_DEBUG(llvm::dbgs() << "+++ Side effects: " << *I << "\n"); if (!First) diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp index b0a1787368f97..2917cd0fe5610 100644 --- a/sycl/test-e2e/HierPar/hier_par_indirect.cpp +++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp @@ -19,12 +19,36 @@ void __attribute__((noinline)) foo(sycl::group<1> work_group) { work_group.parallel_for_work_item([&](sycl::h_item<1> index) {}); } +void __attribute__((noinline)) bar(sycl::group<1> work_group) { + work_group.parallel_for_work_item([&](sycl::h_item<1> index) {}); +} + int main(int argc, char **argv) { sycl::queue q; + + // Try a single indirect call, two indirect calls and an indirect call + // accompanied by multiple parallel_for_work_item calls in the same work_group + // scope. q.submit([&](sycl::handler &cgh) { cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { foo(wGroup); })); }).wait(); + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128}, + ([=](sycl::group<1> wGroup) { + foo(wGroup); + bar(wGroup); + })); + }).wait(); + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group( + sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { + wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {}); + foo(wGroup); + wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {}); + })); + }).wait(); + std::cout << "test passed" << std::endl; return 0; } From 852e3cfaab93be037119fbd8ca3b1612b245d510 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Mon, 27 Oct 2025 10:03:27 -0700 Subject: [PATCH 2/8] Add more tests --- sycl/test-e2e/HierPar/hier_par_indirect.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp index 2917cd0fe5610..073013a48b63b 100644 --- a/sycl/test-e2e/HierPar/hier_par_indirect.cpp +++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp @@ -20,7 +20,7 @@ void __attribute__((noinline)) foo(sycl::group<1> work_group) { } void __attribute__((noinline)) bar(sycl::group<1> work_group) { - work_group.parallel_for_work_item([&](sycl::h_item<1> index) {}); + foo(work_group); } int main(int argc, char **argv) { @@ -34,11 +34,11 @@ int main(int argc, char **argv) { ([=](sycl::group<1> wGroup) { foo(wGroup); })); }).wait(); q.submit([&](sycl::handler &cgh) { - cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128}, - ([=](sycl::group<1> wGroup) { - foo(wGroup); - bar(wGroup); - })); + cgh.parallel_for_work_group( + sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { + foo(wGroup); // 1-layer indirect call + bar(wGroup); // 2-layer indirect call since bar calls foo + })); }).wait(); q.submit([&](sycl::handler &cgh) { cgh.parallel_for_work_group( From 80e8bb80a29c382c2ebfc0a3eba9e67534ca2312 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 28 Oct 2025 06:26:15 -0700 Subject: [PATCH 3/8] Add IR test --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index c9b8388078b2b..0643791e0b692 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -4,7 +4,9 @@ ; Check that allocas which correspond to PFWI lambda object and a local copy of the PFWG lambda object ; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created ; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow, -; then all WIs load the shadow value into their private copies ("materialize" the private copy). +; then all WIs load the shadow value into their private copies ("materialize" the private copy). +; Also check that an indirect call to a function marked with parallel_for_work_item is treated +; the same as a direct call. %struct.bar = type { i8 } %struct.zot = type { %struct.widget, %struct.widget, %struct.widget, %struct.foo } @@ -54,6 +56,7 @@ define internal spir_func void @wibble(ptr addrspace(4) %arg, ptr byval(%struct. ; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] ; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4) ; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]]) +; CHECK-NEXT: call spir_func void @foo(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]]) ; CHECK-NEXT: ret void ; bb: @@ -62,6 +65,7 @@ bb: store ptr addrspace(4) %arg, ptr %0, align 8 %2 = addrspacecast ptr %arg1 to ptr addrspace(4) call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) ret void } @@ -70,4 +74,15 @@ bb: ret void } +define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_group_scope !0 { +bb: + call spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) + ret void +} + +define internal spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { +bb: + ret void +} + !0 = !{} From 444bbb313bb58f93e47ad87515e299e9e10f6ed1 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 28 Oct 2025 13:41:37 -0700 Subject: [PATCH 4/8] Enhance test --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 61 +++++++++++++++++++++----- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 0643791e0b692..5b0c3ad99b781 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -4,8 +4,8 @@ ; Check that allocas which correspond to PFWI lambda object and a local copy of the PFWG lambda object ; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created ; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow, -; then all WIs load the shadow value into their private copies ("materialize" the private copy). -; Also check that an indirect call to a function marked with parallel_for_work_item is treated +; then all WIs load the shadow value into their private copies ("materialize" the private copy). +; Also check that an indirect call to a function marked with parallel_for_work_item is treated ; the same as a direct call. %struct.bar = type { i8 } @@ -69,20 +69,59 @@ bb: ret void } -define internal spir_func void @bar(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { -bb: - ret void -} - define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_group_scope !0 { +; CHECK: bb: +; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(4), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]] +; CHECK: leader: +; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @ArgShadow.4, ptr align 1 [[ARG1:%.*]], i64 1, i1 false) +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[ARG1]], ptr addrspace(3) align 8 @ArgShadow.4, i64 1, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]] +; CHECK: wg_leader: +; CHECK-NEXT: store ptr addrspace(4) [[ARG:%.*]], ptr [[TMP0]], align 8 +; CHECK-NEXT: br label [[WG_CF]] +; CHECK: wg_cf: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]] +; CHECK: TestMat: +; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @WGCopy.3, ptr align 1 [[TMP1]], i64 1, i1 false) +; CHECK-NEXT: [[MAT_LD:%.*]] = load ptr addrspace(4), ptr [[TMP0]], align 8 +; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD]], ptr addrspace(3) @WGCopy.2, align 8 +; CHECK-NEXT: br label [[LEADERMAT]] +; CHECK: LeaderMat: +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[MAT_LD1:%.*]] = load ptr addrspace(4), ptr addrspace(3) @WGCopy.2, align 8 +; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD1]], ptr [[TMP0]], align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[TMP1]], ptr addrspace(3) align 8 @WGCopy.3, i64 1, i1 false) +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4) +; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP5]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]]) +; CHECK-NEXT: ret void +; bb: - call spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) - ret void + %0 = alloca ptr addrspace(4), align 8 + %1 = alloca %struct.foo.0, align 1 + store ptr addrspace(4) %arg, ptr %0, align 8 + %2 = addrspacecast ptr %arg1 to ptr addrspace(4) + call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + ret void } -define internal spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { +define internal spir_func void @bar(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { bb: - ret void + ret void } !0 = !{} From fb54be432d6f64f1c22f4d15bbf9b0ecb78400d5 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 28 Oct 2025 13:49:55 -0700 Subject: [PATCH 5/8] Do some renaming --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 5b0c3ad99b781..94f9c4b0f3d50 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -111,11 +111,11 @@ define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo ; CHECK-NEXT: ret void ; bb: - %0 = alloca ptr addrspace(4), align 8 - %1 = alloca %struct.foo.0, align 1 - store ptr addrspace(4) %arg, ptr %0, align 8 - %2 = addrspacecast ptr %arg1 to ptr addrspace(4) - call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + %1 = alloca ptr addrspace(4), align 8 + %2 = alloca %struct.foo.0, align 1 + store ptr addrspace(4) %arg, ptr %1, align 8 + %3 = addrspacecast ptr %arg1 to ptr addrspace(4) + call spir_func void @bar(ptr addrspace(4) %3, ptr byval(%struct.foo.0) align 1 %2) ret void } From c8ef45300400616126ab59eca7212ff87e98ac19 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Mon, 3 Nov 2025 10:22:26 -0800 Subject: [PATCH 6/8] Rectify incorrect logic in LowerWGScope pass --- llvm/lib/SYCLLowerIR/LowerWGScope.cpp | 22 +++++++++++++++------- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp index 01d276ddaf662..0c107b07e3460 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp @@ -851,14 +851,22 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, } continue; } - // In addition to an instruction not having side effects, we end the range - // if the instruction is a call that contains, possibly several layers - // down the stack, a call to a parallel_for_work_item. Such calls should - // not be subject to lowering since they must be executed by every work - // item. + // We also split the range if the instruction is a call that contains, + // possibly several layers down the stack, a call to a + // parallel_for_work_item. Such calls should not be subject to lowering + // since they must be executed by every work item. const CallInst *Call = dyn_cast(I); - if (!mayHaveSideEffects(I) || - (Call && hasCallToAFuncWithPFWIMetadata(*Call->getCalledFunction()))) + if (Call && hasCallToAFuncWithPFWIMetadata(*Call->getCalledFunction())) { + if (First) { + assert(Last && "range must have been closed 1"); + Ranges.push_back(InstrRange{First, Last}); + First = nullptr; + Last = nullptr; + } + continue; + } + + if (!mayHaveSideEffects(I)) continue; LLVM_DEBUG(llvm::dbgs() << "+++ Side effects: " << *I << "\n"); if (!First) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 94f9c4b0f3d50..9544582590e0b 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -64,8 +64,8 @@ bb: %1 = alloca %struct.foo.0, align 1 store ptr addrspace(4) %arg, ptr %0, align 8 %2 = addrspacecast ptr %arg1 to ptr addrspace(4) - call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) ret void } From 8c22ab200c903e975e0d5e67498f1a1ca05ae1c5 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 4 Nov 2025 08:17:24 -0800 Subject: [PATCH 7/8] Add E2E test when work group scope variable is used in a work item scope --- sycl/test-e2e/HierPar/hier_par_indirect.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp index 073013a48b63b..ae9dee5a25381 100644 --- a/sycl/test-e2e/HierPar/hier_par_indirect.cpp +++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp @@ -48,6 +48,17 @@ int main(int argc, char **argv) { wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {}); })); }).wait(); + // Also try an example of a work-group scope variable being used in work item + // scope + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group( + sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { + int data; + foo(wGroup); + wGroup.parallel_for_work_item( + [&](sycl::h_item<1> index) { data = 0; }); + })); + }).wait(); std::cout << "test passed" << std::endl; return 0; From e2d3c8d972d8c3d68dc97d6f6998eb21710ec0ff Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Tue, 4 Nov 2025 11:24:12 -0500 Subject: [PATCH 8/8] Update pfwg_and_pfwi.ll --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 9544582590e0b..94f9c4b0f3d50 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -64,8 +64,8 @@ bb: %1 = alloca %struct.foo.0, align 1 store ptr addrspace(4) %arg, ptr %0, align 8 %2 = addrspacecast ptr %arg1 to ptr addrspace(4) - call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) ret void }