diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -26,19 +26,25 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/Assumptions.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -98,6 +104,11 @@ cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false)); +static cl::opt<bool> DisableOpenMPOptBarrierElimination( "openmp-opt-disable-barrier-elimination", cl::ZeroOrMore, cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false)); + static cl::opt<bool> PrintModuleAfterOptimizations( "openmp-opt-print-module", cl::ZeroOrMore, cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false)); @@ -147,6 +158,7 @@ "Number of OpenMP parallel regions merged"); STATISTIC(NumBytesMovedToSharedMemory, "Amount of memory pushed to shared memory"); +STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated"); #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; @@ -795,6 +807,8 @@ if (remarksEnabled()) 
analysisGlobalization(); + + Changed |= eliminateBarriers(); } else { if (PrintICVValues) printICVs(); @@ -817,6 +831,8 @@ Changed = true; } } + + Changed |= eliminateBarriers(); } return Changed; @@ -1386,6 +1402,213 @@ return Changed; } + /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels. + /// TODO: Make this an AA and expand it to work across blocks and functions. + bool eliminateBarriers() { + bool Changed = false; + + if (DisableOpenMPOptBarrierElimination) + return /*Changed=*/false; + + if (OMPInfoCache.Kernels.empty()) + return /*Changed=*/false; + + enum ImplicitBarrierType { IBT_ENTRY, IBT_EXIT }; + + class BarrierInfo { + Instruction *I; + enum ImplicitBarrierType Type; + + public: + BarrierInfo(enum ImplicitBarrierType Type) : I(nullptr), Type(Type) {} + BarrierInfo(Instruction &I) : I(&I) {} + + bool isImplicit() { return !I; } + + bool isImplicitEntry() { return isImplicit() && Type == IBT_ENTRY; } + + bool isImplicitExit() { return isImplicit() && Type == IBT_EXIT; } + + Instruction *getInstruction() { return I; } + }; + + for (Function *Kernel : OMPInfoCache.Kernels) { + for (BasicBlock &BB : *Kernel) { + SmallVector<BarrierInfo, 8> BarriersInBlock; + SmallPtrSet<Instruction *, 4> BarriersToBeDeleted; + + // Add the kernel entry implicit barrier. + if (&Kernel->getEntryBlock() == &BB) + BarriersInBlock.push_back(IBT_ENTRY); + + // Find implicit and explicit aligned barriers in the same basic block. + for (Instruction &I : BB) { + if (isa<ReturnInst>(I)) { + // Add the implicit barrier when exiting the kernel. 
+ BarriersInBlock.push_back(IBT_EXIT); + continue; + } + CallBase *CB = dyn_cast<CallBase>(&I); + if (!CB) + continue; + + auto IsAlignBarrierCB = [&](CallBase &CB) { + switch (CB.getIntrinsicID()) { + case Intrinsic::nvvm_barrier0: + case Intrinsic::nvvm_barrier0_and: + case Intrinsic::nvvm_barrier0_or: + case Intrinsic::nvvm_barrier0_popc: + case Intrinsic::amdgcn_s_barrier: + return true; + default: + break; + } + return hasAssumption(CB, + KnownAssumptionString("ompx_aligned_barrier")); + }; + + if (IsAlignBarrierCB(*CB)) { + // Add an explicit aligned barrier. + BarriersInBlock.push_back(I); + } + } + + if (BarriersInBlock.size() <= 1) + continue; + + // A barrier in a barrier pair is removeable if all instructions + // between the barriers in the pair are side-effect free modulo the + // barrier operation. + auto IsBarrierRemoveable = [&Kernel](BarrierInfo *StartBI, + BarrierInfo *EndBI) { + assert( + !StartBI->isImplicitExit() && + "Expected start barrier to be other than a kernel exit barrier"); + assert( + !EndBI->isImplicitEntry() && + "Expected end barrier to be other than a kernel entry barrier"); + // If the StartBI instruction is null then this is the implicit + // kernel entry barrier, so iterate from the first instruction in the + // entry block. + Instruction *I = (StartBI->isImplicitEntry()) + ? &Kernel->getEntryBlock().front() + : StartBI->getInstruction()->getNextNode(); + assert(I && "Expected non-null start instruction"); + Instruction *E = (EndBI->isImplicitExit()) + ? I->getParent()->getTerminator() + : EndBI->getInstruction(); + assert(E && "Expected non-null end instruction"); + + for (; I != E; I = I->getNextNode()) { + if (!I->mayHaveSideEffects() && !I->mayReadFromMemory()) + continue; + + auto IsPotentiallyAffectedByBarrier = + [](Optional<MemoryLocation> Loc) { + const Value *Obj = (Loc && Loc->Ptr) + ? 
getUnderlyingObject(Loc->Ptr) + : nullptr; + if (!Obj) { + LLVM_DEBUG( + dbgs() + << "Access to unknown location requires barriers\n"); + return true; + } + if (isa<UndefValue>(Obj)) + return false; + if (isa<AllocaInst>(Obj)) + return false; + if (auto *GV = dyn_cast<GlobalVariable>(Obj)) { + if (GV->isConstant()) + return false; + if (GV->isThreadLocal()) + return false; + if (GV->getAddressSpace() == (int)AddressSpace::Local) + return false; + if (GV->getAddressSpace() == (int)AddressSpace::Constant) + return false; + } + LLVM_DEBUG(dbgs() << "Access to '" << *Obj + << "' requires barriers\n"); + return true; + }; + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { + Optional<MemoryLocation> Loc = MemoryLocation::getForDest(MI); + if (IsPotentiallyAffectedByBarrier(Loc)) + return false; + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { + Optional<MemoryLocation> Loc = + MemoryLocation::getForSource(MTI); + if (IsPotentiallyAffectedByBarrier(Loc)) + return false; + } + continue; + } + + if (auto *LI = dyn_cast<LoadInst>(I)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) + continue; + + Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I); + if (IsPotentiallyAffectedByBarrier(Loc)) + return false; + } + + return true; + }; + + // Iterate barrier pairs and remove an explicit barrier if analysis + // deems it removeable. + for (auto *It = BarriersInBlock.begin(), + *End = BarriersInBlock.end() - 1; + It != End; ++It) { + + BarrierInfo *StartBI = It; + BarrierInfo *EndBI = (It + 1); + + // Cannot remove when both are implicit barriers, continue. + if (StartBI->isImplicit() && EndBI->isImplicit()) + continue; + + if (!IsBarrierRemoveable(StartBI, EndBI)) + continue; + + assert(!(StartBI->isImplicit() && EndBI->isImplicit()) && + "Expected at least one explicit barrier to remove."); + + // Remove an explicit barrier, check first, then second. 
+ if (!StartBI->isImplicit()) { + LLVM_DEBUG(dbgs() << "Remove start barrier " + << *StartBI->getInstruction() << "\n"); + BarriersToBeDeleted.insert(StartBI->getInstruction()); + } else { + LLVM_DEBUG(dbgs() << "Remove end barrier " + << *EndBI->getInstruction() << "\n"); + BarriersToBeDeleted.insert(EndBI->getInstruction()); + } + } + + if (BarriersToBeDeleted.empty()) + continue; + + Changed = true; + for (Instruction *I : BarriersToBeDeleted) { + ++NumBarriersEliminated; + auto Remark = [&](OptimizationRemark OR) { + return OR << "Redundant barrier eliminated."; + }; + + if (EnableVerboseRemarks) + emitRemark<OptimizationRemark>(I, "OMP190", Remark); + I->eraseFromParent(); + } + } + } + + return Changed; + } + void analysisGlobalization() { auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt < %s -S -openmp-opt-cgscc | FileCheck %s +; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s + +declare void @useI32(i32) +declare void @unknown() +declare void @aligned_barrier() "llvm.assume"="ompx_aligned_barrier" +declare void @llvm.nvvm.barrier0() +declare i32 @llvm.nvvm.barrier0.and(i32) +declare i32 @llvm.nvvm.barrier0.or(i32) +declare i32 @llvm.nvvm.barrier0.popc(i32) +declare void @llvm.amdgcn.s.barrier() + +;. 
+; CHECK: @[[GC1:[a-zA-Z0-9_$"\\.-]+]] = constant i32 42 +; CHECK: @[[GC2:[a-zA-Z0-9_$"\\.-]+]] = addrspace(4) global i32 0 +; CHECK: @[[GPTR4:[a-zA-Z0-9_$"\\.-]+]] = addrspace(4) global i32 addrspace(4)* null +; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i32 42 +; CHECK: @[[GS:[a-zA-Z0-9_$"\\.-]+]] = addrspace(3) global i32 0 +; CHECK: @[[GPTR:[a-zA-Z0-9_$"\\.-]+]] = global i32* null +; CHECK: @[[PG1:[a-zA-Z0-9_$"\\.-]+]] = thread_local global i32 42 +; CHECK: @[[PG2:[a-zA-Z0-9_$"\\.-]+]] = addrspace(5) global i32 0 +; CHECK: @[[GPTR5:[a-zA-Z0-9_$"\\.-]+]] = global i32 addrspace(5)* null +; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = global i32 42 +; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global i32 0 +;. +define void @pos_empty_1() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_1() { +; CHECK-NEXT: ret void +; + call void @unknown() "llvm.assume"="ompx_aligned_barrier" + ret void +} +define void @pos_empty_2() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_2() { +; CHECK-NEXT: ret void +; + call void @aligned_barrier() + ret void +} +define void @pos_empty_3() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_3() { +; CHECK-NEXT: ret void +; + call void @llvm.nvvm.barrier0() + ret void +} +define void @pos_empty_4() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_4() { +; CHECK-NEXT: ret void +; + call i32 @llvm.nvvm.barrier0.and(i32 0) + ret void +} +define void @pos_empty_5() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_5() { +; CHECK-NEXT: ret void +; + call i32 @llvm.nvvm.barrier0.or(i32 0) + ret void +} +define void @pos_empty_6() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_6() { +; CHECK-NEXT: ret void +; + call i32 @llvm.nvvm.barrier0.popc(i32 0) + ret void +} +define void @pos_empty_7() { +; CHECK-LABEL: define {{[^@]+}}@pos_empty_7() { +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier() + ret void +} +define void @neg_empty_1() { +; CHECK-LABEL: define {{[^@]+}}@neg_empty_1() { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call 
void @unknown() + ret void +} +define void @neg_empty_2() { +; CHECK-LABEL: define {{[^@]+}}@neg_empty_2() { +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: ret void +; + call void @aligned_barrier() + ret void +} + +@GC1 = constant i32 42 +@GC2 = addrspace(4) global i32 0 +@GPtr4 = addrspace(4) global i32 addrspace(4)* null +define void @pos_constant_loads() { +; CHECK-LABEL: define {{[^@]+}}@pos_constant_loads() { +; CHECK-NEXT: [[ARG:%.*]] = load i32 addrspace(4)*, i32 addrspace(4)** addrspacecast (i32 addrspace(4)* addrspace(4)* @GPtr4 to i32 addrspace(4)**), align 8 +; CHECK-NEXT: [[A:%.*]] = load i32, i32* @GC1, align 4 +; CHECK-NEXT: [[B:%.*]] = load i32, i32* addrspacecast (i32 addrspace(4)* @GC2 to i32*), align 4 +; CHECK-NEXT: [[ARGC:%.*]] = addrspacecast i32 addrspace(4)* [[ARG]] to i32* +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[ARGC]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: [[D:%.*]] = add i32 [[A]], [[B]] +; CHECK-NEXT: [[E:%.*]] = add i32 [[D]], [[C]] +; CHECK-NEXT: call void @useI32(i32 [[E]]) +; CHECK-NEXT: ret void +; + %GPtr4c = addrspacecast i32 addrspace(4)*addrspace(4)* @GPtr4 to i32 addrspace(4)** + %arg = load i32 addrspace(4)*, i32 addrspace(4)** %GPtr4c + %a = load i32, i32* @GC1 + call void @aligned_barrier() + %GC2c = addrspacecast i32 addrspace(4)* @GC2 to i32* + %b = load i32, i32* %GC2c + call void @aligned_barrier() + %argc = addrspacecast i32 addrspace(4)* %arg to i32* + %c = load i32, i32* %argc + call void @aligned_barrier() + %d = add i32 %a, %b + %e = add i32 %d, %c + call void @useI32(i32 %e) + ret void +} +@G = global i32 42 +@GS = addrspace(3) global i32 0 +@GPtr = global i32* null +; TODO: We could remove some of the barriers due to the lack of write effects. 
+define void @neg_loads() { +; CHECK-LABEL: define {{[^@]+}}@neg_loads() { +; CHECK-NEXT: [[ARG:%.*]] = load i32*, i32** @GPtr, align 8 +; CHECK-NEXT: [[A:%.*]] = load i32, i32* @G, align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: [[B:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @GS to i32*), align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[ARG]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: [[D:%.*]] = add i32 [[A]], [[B]] +; CHECK-NEXT: [[E:%.*]] = add i32 [[D]], [[C]] +; CHECK-NEXT: call void @useI32(i32 [[E]]) +; CHECK-NEXT: ret void +; + %arg = load i32*, i32** @GPtr + %a = load i32, i32* @G + call void @aligned_barrier() + %GSc = addrspacecast i32 addrspace(3)* @GS to i32* + %b = load i32, i32* %GSc + call void @aligned_barrier() + %c = load i32, i32* %arg + call void @aligned_barrier() + %d = add i32 %a, %b + %e = add i32 %d, %c + call void @useI32(i32 %e) + ret void +} +@PG1 = thread_local global i32 42 +@PG2 = addrspace(5) global i32 0 +@GPtr5 = global i32 addrspace(5)* null +define void @pos_priv_mem() { +; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem() { +; CHECK-NEXT: [[ARG:%.*]] = load i32 addrspace(5)*, i32 addrspace(5)** @GPtr5, align 8 +; CHECK-NEXT: [[LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = load i32, i32* @PG1, align 4 +; CHECK-NEXT: store i32 [[A]], i32* [[LOC]], align 4 +; CHECK-NEXT: [[B:%.*]] = load i32, i32* addrspacecast (i32 addrspace(5)* @PG2 to i32*), align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: [[ARGC:%.*]] = addrspacecast i32 addrspace(5)* [[ARG]] to i32* +; CHECK-NEXT: store i32 [[B]], i32* [[ARGC]], align 4 +; CHECK-NEXT: store i32 [[A]], i32* @PG1, align 4 +; CHECK-NEXT: ret void +; + %arg = load i32 addrspace(5)*, i32 addrspace(5)** @GPtr5 + %loc = alloca i32 + %a = load i32, i32* @PG1 + call void @aligned_barrier() + store i32 %a, i32* %loc + %PG2c = addrspacecast i32 addrspace(5)* @PG2 to i32* + 
%b = load i32, i32* %PG2c + call void @aligned_barrier() + %argc = addrspacecast i32 addrspace(5)* %arg to i32* + store i32 %b, i32* %argc + call void @aligned_barrier() + %v = load i32, i32* %loc + store i32 %v, i32* @PG1 + call void @aligned_barrier() + ret void +} +@G1 = global i32 42 +@G2 = addrspace(1) global i32 0 +define void @neg_mem() { +; CHECK-LABEL: define {{[^@]+}}@neg_mem() { +; CHECK-NEXT: [[ARG:%.*]] = load i32*, i32** @GPtr, align 8 +; CHECK-NEXT: [[A:%.*]] = load i32, i32* @G1, align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: store i32 [[A]], i32* [[ARG]], align 4 +; CHECK-NEXT: call void @aligned_barrier() +; CHECK-NEXT: [[B:%.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @G2 to i32*), align 4 +; CHECK-NEXT: store i32 [[B]], i32* @G1, align 4 +; CHECK-NEXT: ret void +; + %arg = load i32*, i32** @GPtr + %a = load i32, i32* @G1 + call void @aligned_barrier() + store i32 %a, i32* %arg + call void @aligned_barrier() + %G2c = addrspacecast i32 addrspace(1)* @G2 to i32* + %b = load i32, i32* %G2c + store i32 %b, i32* @G1 + call void @aligned_barrier() + ret void +} + +define void @pos_multiple() { +; CHECK-LABEL: define {{[^@]+}}@pos_multiple() { +; CHECK-NEXT: ret void +; + call void @llvm.nvvm.barrier0() + call void @aligned_barrier() + call void @aligned_barrier() + call void @llvm.amdgcn.s.barrier() + call void @aligned_barrier() + call void @llvm.nvvm.barrier0() + call void @aligned_barrier() + call void @aligned_barrier() + ret void +} + +!llvm.module.flags = !{!12,!13} +!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11} + +!0 = !{void ()* @pos_empty_1, !"kernel", i32 1} +!1 = !{void ()* @pos_empty_2, !"kernel", i32 1} +!2 = !{void ()* @pos_empty_3, !"kernel", i32 1} +!3 = !{void ()* @pos_empty_4, !"kernel", i32 1} +!4 = !{void ()* @pos_empty_5, !"kernel", i32 1} +!5 = !{void ()* @pos_empty_6, !"kernel", i32 1} +!6 = !{void ()* @pos_empty_7, !"kernel", i32 1} +!7 = !{void ()* @pos_constant_loads, !"kernel", i32 
1} +!8 = !{void ()* @neg_loads, !"kernel", i32 1} +!9 = !{void ()* @pos_priv_mem, !"kernel", i32 1} +!10 = !{void ()* @neg_mem, !"kernel", i32 1} +!11 = !{void ()* @pos_multiple, !"kernel", i32 1} +!12 = !{i32 7, !"openmp", i32 50} +!13 = !{i32 7, !"openmp-device", i32 50} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_aligned_barrier" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind willreturn } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META2:![0-9]+]] = !{void ()* @pos_empty_1, !"kernel", i32 1} +; CHECK: [[META3:![0-9]+]] = !{void ()* @pos_empty_2, !"kernel", i32 1} +; CHECK: [[META4:![0-9]+]] = !{void ()* @pos_empty_3, !"kernel", i32 1} +; CHECK: [[META5:![0-9]+]] = !{void ()* @pos_empty_4, !"kernel", i32 1} +; CHECK: [[META6:![0-9]+]] = !{void ()* @pos_empty_5, !"kernel", i32 1} +; CHECK: [[META7:![0-9]+]] = !{void ()* @pos_empty_6, !"kernel", i32 1} +; CHECK: [[META8:![0-9]+]] = !{void ()* @pos_empty_7, !"kernel", i32 1} +; CHECK: [[META9:![0-9]+]] = !{void ()* @pos_constant_loads, !"kernel", i32 1} +; CHECK: [[META10:![0-9]+]] = !{void ()* @neg_loads, !"kernel", i32 1} +; CHECK: [[META11:![0-9]+]] = !{void ()* @pos_priv_mem, !"kernel", i32 1} +; CHECK: [[META12:![0-9]+]] = !{void ()* @neg_mem, !"kernel", i32 1} +; CHECK: [[META13:![0-9]+]] = !{void ()* @pos_multiple, !"kernel", i32 1} +;. diff --git a/openmp/docs/remarks/OMP180.rst b/openmp/docs/remarks/OMP180.rst --- a/openmp/docs/remarks/OMP180.rst +++ b/openmp/docs/remarks/OMP180.rst @@ -14,7 +14,7 @@ This optimization will trigger for most target regions to simplify the runtime once certain constants are known. This will trigger for internal runtime functions so it requires enabling verbose remarks with -`-openmp-opt-verbose-remarks`. +`-openmp-opt-verbose-remarks` (prefixed with `-mllvm` for use with clang). 
.. code-block:: c++ diff --git a/openmp/docs/remarks/OMP190.rst b/openmp/docs/remarks/OMP190.rst new file mode 100644 --- /dev/null +++ b/openmp/docs/remarks/OMP190.rst @@ -0,0 +1,23 @@ +.. _omp190: + +Redundant barrier eliminated. (device only) +==================================================================== + +This optimization remark indicates that analysis determined an aligned +barrier in the device code to be redundant. This can occur when state +updates that have been synchronized by the barrier were eliminated too. +See also "Co-Designing an OpenMP GPU Runtime and Optimizations for Near-Zero +Overhead Execution", IPDPS'22. + +Example +------- + +This optimization will trigger for most target regions if state initialization +was removed as a consequence of "state forwarding". This will trigger for +internal runtime functions so it requires enabling verbose remarks with +`-openmp-opt-verbose-remarks` (prefixed with `-mllvm` for use with clang). + +Diagnostic Scope +---------------- + +OpenMP optimization remark. diff --git a/openmp/docs/remarks/OptimizationRemarks.rst b/openmp/docs/remarks/OptimizationRemarks.rst --- a/openmp/docs/remarks/OptimizationRemarks.rst +++ b/openmp/docs/remarks/OptimizationRemarks.rst @@ -40,6 +40,7 @@ OMP160 OMP170 OMP180 + OMP190 .. list-table:: :widths: 15 15 70 @@ -111,3 +112,6 @@ * - :ref:`OMP180 <omp180>` - Optimization - Replacing OpenMP runtime call ``<Name>`` with ``<Value>``. + * - :ref:`OMP190 <omp190>` + - Optimization + - Redundant barrier eliminated. (device only)