diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -19,14 +19,15 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" -#include "llvm/Analysis/ValueTracking.h" using namespace llvm; using namespace omp; @@ -38,6 +39,11 @@ cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false)); +static cl::opt EnableParallelRegionMerging( + "openmp-opt-enable-merging", cl::ZeroOrMore, + cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, + cl::init(false)); + static cl::opt PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden); static cl::opt PrintOpenMPKernels("openmp-print-gpu-kernels", @@ -63,6 +69,8 @@ STATISTIC( NumOpenMPParallelRegionsReplacedInGPUStateMachine, "Number of OpenMP parallel regions replaced with ID in GPU state machines"); +STATISTIC(NumOpenMPParallelRegionsMerged, + "Number of OpenMP parallel regions merged"); #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; @@ -505,12 +513,18 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); - Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); if (HideMemoryTransferLatency) Changed |= hideMemTransfersLatency(); if (remarksEnabled()) analysisGlobalization(); + Changed |= deduplicateRuntimeCalls(); + if (EnableParallelRegionMerging) { + if (mergeParallelRegions()) { + deduplicateRuntimeCalls(); + Changed = true; + } + } return Changed; } @@ -575,6 +589,244 @@ } private: + /// Merge parallel regions when it is safe. + bool mergeParallelRegions() { + const unsigned CallbackCalleeOperand = 2; + const unsigned CallbackFirstArgOperand = 3; + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + + // Check if there are any __kmpc_fork_call calls to merge. + OMPInformationCache::RuntimeFunctionInfo &RFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; + + if (!RFI.Declaration) + return false; + + // Check if there any __kmpc_push_proc_bind calls for explicit affinities. + OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind]; + + // Defensively abort if explicit affinities are set. + // TODO: Track ICV proc_bind to merge when mergable regions have the same + // affinity. + if (ProcBindRFI.Declaration) + return false; + + bool Changed = false; + LoopInfo *LI = nullptr; + DominatorTree *DT = nullptr; + + SmallDenseMap> BB2PRMap; + + BasicBlock *StartBB = nullptr, *EndBB = nullptr; + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + BasicBlock *CGStartBB = CodeGenIP.getBlock(); + BasicBlock *CGEndBB = + SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); + assert(StartBB != nullptr && "StartBB should not be null"); + CGStartBB->getTerminator()->setSuccessor(0, StartBB); + assert(EndBB != nullptr && "EndBB should not be null"); + EndBB->getTerminator()->setSuccessor(0, CGEndBB); + }; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &VPtr, Value *&ReplacementValue) -> InsertPointTy { + ReplacementValue = &VPtr; + return CodeGenIP; + }; + + auto FiniCB = [&](InsertPointTy CodeGenIP) {}; + + // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all + // contained in BB and only separated by instructions that can be + // redundantly executed in parallel. The block BB is split before the first + // call (in MergableCIs) and after the last so the entire region we merge + // into a single parallel region is contained in a single basic block + // without any other instructions. We use the OpenMPIRBuilder to outline + // that block and call the resulting function via __kmpc_fork_call. + auto Merge = [&](SmallVectorImpl &MergableCIs, BasicBlock *BB) { + // TODO: Change the interface to allow single CIs expanded, e.g, to + // include an outer loop. + assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); + + auto Remark = [&](OptimizationRemark OR) { + OR << "Parallel region at " + << ore::NV("OpenMPParallelMergeFront", + MergableCIs.front()->getDebugLoc()) + << " merged with parallel regions at "; + for (auto *CI : + llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) { + OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()); + if (CI != MergableCIs.back()) + OR << ", "; + } + return OR; + }; + + emitRemark(MergableCIs.front(), + "OpenMPParallelRegionMerging", Remark); + + Function *OriginalFn = BB->getParent(); + LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size() + << " parallel regions in " << OriginalFn->getName() + << "\n"); + + // Isolate the calls to merge in a separate block. + EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); + BasicBlock *AfterBB = + SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); + StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, + "omp.par.merged"); + + assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); + const DebugLoc DL = BB->getTerminator()->getDebugLoc(); + BB->getTerminator()->eraseFromParent(); + + OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), + DL); + IRBuilder<>::InsertPoint AllocaIP( + &OriginalFn->getEntryBlock(), + OriginalFn->getEntryBlock().getFirstInsertionPt()); + // Create the merged parallel region with default proc binding, to + // avoid overriding binding settings, and without explicit cancellation. + InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel( + Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, + OMP_PROC_BIND_default, /* IsCancellable */ false); + BranchInst::Create(AfterBB, AfterIP.getBlock()); + + // Perform the actual outlining. + OMPInfoCache.OMPBuilder.finalize(); + + Function *OutlinedFn = MergableCIs.front()->getCaller(); + + // Replace the __kmpc_fork_call calls with direct calls to the outlined + // callbacks. + SmallVector Args; + for (auto *CI : MergableCIs) { + Value *Callee = + CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); + FunctionType *FT = + cast(Callee->getType()->getPointerElementType()); + Args.clear(); + Args.push_back(OutlinedFn->getArg(0)); + Args.push_back(OutlinedFn->getArg(1)); + for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); + U < E; ++U) + Args.push_back(CI->getArgOperand(U)); + + CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); + if (CI->getDebugLoc()) + NewCI->setDebugLoc(CI->getDebugLoc()); + + // Forward parameter attributes from the callback to the callee. + for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); + U < E; ++U) + for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) + NewCI->addParamAttr( + U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); + + // Emit an explicit barrier to replace the implicit fork-join barrier. + if (CI != MergableCIs.back()) { + // TODO: Remove barrier if the merged parallel region includes the + // 'nowait' clause. + OMPInfoCache.OMPBuilder.CreateBarrier( + InsertPointTy(NewCI->getParent(), + NewCI->getNextNode()->getIterator()), + OMPD_parallel); + } + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Parallel region at " + << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()) + << " merged with " + << ore::NV("OpenMPParallelMergeFront", + MergableCIs.front()->getDebugLoc()); + }; + if (CI != MergableCIs.front()) + emitRemark(CI, "OpenMPParallelRegionMerging", + Remark); + + CI->eraseFromParent(); + } + + assert(OutlinedFn != OriginalFn && "Outlining failed"); + CGUpdater.registerOutlinedFunction(*OutlinedFn); + CGUpdater.reanalyzeFunction(*OriginalFn); + + NumOpenMPParallelRegionsMerged += MergableCIs.size(); + + return true; + }; + + // Helper function that identifes sequences of + // __kmpc_fork_call uses in a basic block. + auto DetectPRsCB = [&](Use &U, Function &F) { + CallInst *CI = getCallIfRegularCall(U, &RFI); + BB2PRMap[CI->getParent()].insert(CI); + + return false; + }; + + BB2PRMap.clear(); + RFI.foreachUse(SCC, DetectPRsCB); + SmallVector, 4> MergableCIsVector; + // Find mergable parallel regions within a basic block that are + // safe to merge, that is any in-between instructions can safely + // execute in parallel after merging. + // TODO: support merging across basic-blocks. + for (auto &It : BB2PRMap) { + auto &CIs = It.getSecond(); + if (CIs.size() < 2) + continue; + + BasicBlock *BB = It.getFirst(); + SmallVector MergableCIs; + + // Find maximal number of parallel region CIs that are safe to merge. + for (Instruction &I : *BB) { + if (CIs.count(&I)) { + MergableCIs.push_back(cast(&I)); + continue; + } + + if (isSafeToSpeculativelyExecute(&I, &I, DT)) + continue; + + if (MergableCIs.size() > 1) { + MergableCIsVector.push_back(MergableCIs); + LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size() + << " parallel regions in block " << BB->getName() + << " of function " << BB->getParent()->getName() + << "\n";); + } + + MergableCIs.clear(); + } + + if (!MergableCIsVector.empty()) { + Changed = true; + + for (auto &MergableCIs : MergableCIsVector) + Merge(MergableCIs, BB); + } + } + + if (Changed) { + // Update RFI info to set it up for later passes. + RFI.clearUsesMap(); + OMPInfoCache.collectUses(RFI, /* CollectStats */ false); + + // Collect uses for the emitted barrier call. + OMPInformationCache::RuntimeFunctionInfo &BarrierRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_barrier]; + BarrierRFI.clearUsesMap(); + OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false); + } + + return Changed; + } + /// Try to delete parallel regions if possible. bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs +; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 + +; void merge_all() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; #pragma omp parallel +; { +; a = 3; +; } +; } +; +; Merge all parallel regions. +define dso_local void @merge_all() local_unnamed_addr { + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 1, i32* %1, align 4 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1) + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1) + ret void +} + +define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 3, i32* %2, align 4 + ret void +} + +define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 2, i32* %2, align 4 + ret void +} + + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr + +; void merge_none() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; } +; +; Does not merge parallel regions, in-between store +; instruction is unsafe to execute in parallel. +define dso_local void @merge_none() local_unnamed_addr { + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 1, i32* %1, align 4 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1) + store i32 3, i32* %1, align 4 + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1) + ret void +} + +define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 4, i32* %2, align 4 + ret void +} + +define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 2, i32* %2, align 4 + ret void +} + +; void merge_some() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; #pragma omp parallel +; { +; a = 5; +; } +; } +; +; Do not merge first parallel region, due to the +; unsafe store, but merge the two next parallel +; regions. +define dso_local void @merge_some() local_unnamed_addr { + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 1, i32* %1, align 4 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1) + store i32 3, i32* %1, align 4 + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1) + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1) + ret void +} + +define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 5, i32* %2, align 4 + ret void +} + +define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 4, i32* %2, align 4 + ret void +} + +define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 2, i32* %2, align 4 + ret void +} + +; void merge_cancellable_regions(int cancel1, int cancel2) +; { +; #pragma omp parallel +; { +; if(cancel1) { +; #pragma omp cancel parallel +; } +; } +; +; #pragma omp parallel +; { +; if (cancel2) { +; #pragma omp cancel parallel +; } +; } +; } +; +; Merge correctly cancellable regions. +define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr { + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + store i32 %0, i32* %3, align 4 + store i32 %1, i32* %4, align 4 + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3) + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4) + ret void +} + +define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) { + %4 = load i32, i32* %2, align 4 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %3 + ret void + +7: ; preds = %3 + %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1) + ret void +} + +define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) { + %4 = load i32, i32* %2, align 4 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %3 + ret void + +7: ; preds = %3 + %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1) + ret void +} + +declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr + + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!2} +!2 = !{i64 2, i64 -1, i64 -1, i1 true} +; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.3 to void (i32*, i32*, ...)*), i32* [[TMP2]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]] +; CHECK: .split.split: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.3 +; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] { +; CHECK-NEXT: omp.par.entry: +; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] +; CHECK: omp.par.outlined.exit.exitStub: +; CHECK-NEXT: ret void +; CHECK: omp.par.region: +; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]] +; CHECK: omp.par.merged: +; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]] +; CHECK: omp.par.region.split: +; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] +; CHECK: omp.par.pre_finalize: +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] { +; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]]) +; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]]) +; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]]) +; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]]) +; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]] +; CHECK: .split.split: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.2 +; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] { +; CHECK-NEXT: omp.par.entry: +; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] +; CHECK: omp.par.outlined.exit.exitStub: +; CHECK-NEXT: ret void +; CHECK: omp.par.region: +; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]] +; CHECK: omp.par.merged: +; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]] +; CHECK: omp.par.region.split: +; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] +; CHECK: omp.par.pre_finalize: +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions +; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]]) +; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.1 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]] +; CHECK: .split.split: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.1 +; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] { +; CHECK-NEXT: omp.par.entry: +; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] +; CHECK: omp.par.outlined.exit.exitStub: +; CHECK-NEXT: ret void +; CHECK: omp.par.region: +; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]] +; CHECK: omp.par.merged: +; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]] +; CHECK: omp.par.region.split: +; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] +; CHECK: omp.par.pre_finalize: +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1) +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs +; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 + +; void merge_all() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; #pragma omp parallel +; { +; a = 3; +; } +; } +; +; Merge all parallel regions. +define dso_local void @merge_all() local_unnamed_addr { + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 1, i32* %1, align 4 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1) + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1) + ret void +} + +define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 3, i32* %2, align 4 + ret void +} + +define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 2, i32* %2, align 4 + ret void +} + + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr + +; void merge_none() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; } +; +; Does not merge parallel regions, in-between store +; instruction is unsafe to execute in parallel. +define dso_local void @merge_none() local_unnamed_addr { + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 1, i32* %1, align 4 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1) + store i32 3, i32* %1, align 4 + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1) + ret void +} + +define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 4, i32* %2, align 4 + ret void +} + +define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 2, i32* %2, align 4 + ret void +} + +; void merge_some() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; #pragma omp parallel +; { +; a = 5; +; } +; } +; +; Do not merge first parallel region, due to the +; unsafe store, but merge the two next parallel +; regions. +define dso_local void @merge_some() local_unnamed_addr { + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 1, i32* %1, align 4 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1) + store i32 3, i32* %1, align 4 + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1) + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1) + ret void +} + +define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 5, i32* %2, align 4 + ret void +} + +define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 4, i32* %2, align 4 + ret void +} + +define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) { + store i32 2, i32* %2, align 4 + ret void +} + +; void merge_cancellable_regions(int cancel1, int cancel2) +; { +; #pragma omp parallel +; { +; if(cancel1) { +; #pragma omp cancel parallel +; } +; } +; +; #pragma omp parallel +; { +; if (cancel2) { +; #pragma omp cancel parallel +; } +; } +; } +; +; Merge correctly cancellable regions. +define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr { + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + store i32 %0, i32* %3, align 4 + store i32 %1, i32* %4, align 4 + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3) + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4) + ret void +} + +define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) { + %4 = load i32, i32* %2, align 4 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %3 + ret void + +7: ; preds = %3 + %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1) + ret void +} + +define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) { + %4 = load i32, i32* %2, align 4 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %3 + ret void + +7: ; preds = %3 + %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) + %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1) + ret void +} + +declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr + + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!2} +!2 = !{i64 2, i64 -1, i64 -1, i1 true} +; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]] +; CHECK: .split.split: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2 +; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] { +; CHECK-NEXT: omp.par.entry: +; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] +; CHECK: omp.par.outlined.exit.exitStub: +; CHECK-NEXT: ret void +; CHECK: omp.par.region: +; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]] +; CHECK: omp.par.merged: +; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]] +; CHECK: omp.par.region.split: +; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] +; CHECK: omp.par.pre_finalize: +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] { +; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]]) +; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]]) +; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]]) +; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]]) +; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]] +; CHECK: .split.split: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5 +; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] { +; CHECK-NEXT: omp.par.entry: +; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] +; CHECK: omp.par.outlined.exit.exitStub: +; CHECK-NEXT: ret void +; CHECK: omp.par.region: +; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]] +; CHECK: omp.par.merged: +; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]] +; CHECK: omp.par.region.split: +; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] +; CHECK: omp.par.pre_finalize: +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] { +; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions +; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]]) +; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]] +; CHECK: .split.split: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6 +; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] { +; CHECK-NEXT: omp.par.entry: +; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4 +; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] +; CHECK: omp.par.outlined.exit.exitStub: +; CHECK-NEXT: ret void +; CHECK: omp.par.region: +; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]] +; CHECK: omp.par.merged: +; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]] +; CHECK: omp.par.region.split: +; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] +; CHECK: omp.par.pre_finalize: +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1) +; CHECK-NEXT: ret void +;