diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -33,6 +33,8 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -2554,9 +2556,8 @@ auto &OMPInfoCache = static_cast(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; - // Check if the edge into the successor block compares the __kmpc_target_init - // result with -1. If we are in non-SPMD-mode that signals only the main - // thread will execute the edge. + // Check if the edge into the successor block contains a condition that only + // lets the main thread execute it. auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { if (!Edge || !Edge->isConditional()) return false; @@ -2571,7 +2572,7 @@ if (!C) return false; - // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) + // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) if (C->isAllOnesValue()) { auto *CB = dyn_cast(Cmp->getOperand(0)); CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; @@ -2583,6 +2584,18 @@ return IsSPMDModeCI && IsSPMDModeCI->isZero(); } + if (C->isZero()) { + // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x() + if (auto *II = dyn_cast(Cmp->getOperand(0))) + if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x) + return true; + + // Match: 0 == llvm.amdgcn.workitem.id.x() + if (auto *II = dyn_cast(Cmp->getOperand(0))) + if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x) + return true; + } + return false; }; diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -18,8 +18,6 @@ %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.else if.then: - call void @nvptx() - call void @amdgcn() br label %if.end if.else: br label %if.end @@ -31,13 +29,15 @@ ; REMARKS: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible. ; REMARKS-NOT: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible. -; CHECK-DAG: [openmp-opt] Basic block @nvptx entry is executed by a single thread. +; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread. ; CHECK-DAG: [openmp-opt] Basic block @nvptx if.then is executed by a single thread. -; CHECK-DAG: [openmp-opt] Basic block @nvptx if.end is executed by a single thread. +; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread. ; Function Attrs: noinline -define internal void @nvptx() { +define void @nvptx() { entry: - br i1 true, label %if.then, label %if.end + %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %if.then, label %if.end if.then: call void @foo() @@ -50,13 +50,15 @@ ret void } -; CHECK-DAG: [openmp-opt] Basic block @amdgcn entry is executed by a single thread. +; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread. ; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread. -; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread. +; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread. ; Function Attrs: noinline -define internal void @amdgcn() { +define void @amdgcn() { entry: - br i1 false, label %if.then, label %if.end + %call = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %if.then, label %if.end if.then: call void @foo() @@ -104,6 +106,7 @@ declare void @__kmpc_kernel_prepare_parallel(i8*) declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) + declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) attributes #0 = { cold noinline }