diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -19,12 +19,14 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" using namespace llvm; @@ -55,6 +57,8 @@ STATISTIC( NumOpenMPParallelRegionsReplacedInGPUStateMachine, "Number of OpenMP parallel regions replaced with ID in GPU state machines"); +STATISTIC(NumOpenMPParallelRegionsMerged, + "Number of OpenMP parallel regions merged"); #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; @@ -506,8 +510,9 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); - Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + Changed |= mergeParallelRegions(); + Changed |= deduplicateRuntimeCalls(); return Changed; } @@ -571,6 +576,209 @@ } private: + /// Merge parallel regions when it is safe. + bool mergeParallelRegions() { + const unsigned CallbackCalleeOperand = 2; + const unsigned CallbackFirstArgOperand = 3; + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + + // Check if there are any __kmpc_fork_call calls to merge. 
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+    if (!RFI.Declaration)
+      return false;
+
+    bool Changed = false;
+    LoopInfo *LI = nullptr;
+    DominatorTree *DT = nullptr;
+
+    // Map from a basic block to the __kmpc_fork_call instructions it contains.
+    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
+
+    // Blocks delimiting the region to outline; set by the Merge helper and
+    // read by the OpenMPIRBuilder body-generation callback below.
+    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         BasicBlock &ContinuationIP) {
+      BasicBlock *CGStartBB = CodeGenIP.getBlock();
+      BasicBlock *CGEndBB =
+          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+      assert(StartBB != nullptr && "StartBB should not be null");
+      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
+      assert(EndBB != nullptr && "EndBB should not be null");
+      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
+    };
+
+    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
+      // No privatization is necessary when merging; values live across the
+      // original fork calls can be used directly.
+      ReplacementValue = &VPtr;
+      return CodeGenIP;
+    };
+
+    auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
+    // contained in BB and only separated by instructions that can be
+    // redundantly executed in parallel. The block BB is split before the first
+    // call (in MergableCIs) and after the last so the entire region we merge
+    // into a single parallel region is contained in a single basic block
+    // without any other instructions. We use the OpenMPIRBuilder to outline
+    // that block and call the resulting function via __kmpc_fork_call.
+    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs,
+                     BasicBlock *BB) {
+      // TODO: Change the interface to allow single CIs expanded, e.g, to
+      // include an outer loop.
+      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
+
+      Function *OriginalFn = BB->getParent();
+      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
+                        << " parallel regions in " << OriginalFn->getName()
+                        << "\n");
+
+      // Isolate the calls to merge in a separate block.
+      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
+      BasicBlock *AfterBB =
+          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
+      StartBB =
+          SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, "expnd_omp_pr");
+
+      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
+      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
+      BB->getTerminator()->eraseFromParent();
+
+      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
+                                               DL);
+      // TODO: Verify proc bind matches and use the value.
+      // TODO: Check the cancellable flag.
+      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
+          Loc, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
+          OMP_PROC_BIND_default, /* IsCancellable */ false);
+      BranchInst::Create(AfterBB, AfterIP.getBlock());
+
+      // Perform the actual outlining.
+      OMPInfoCache.OMPBuilder.finalize();
+
+      // The merged region, including the original fork calls, now lives in
+      // the outlined function, hence getCaller() yields the outlined one.
+      Function *OutlinedFn = MergableCIs.front()->getCaller();
+
+      // Replace the __kmpc_fork_call calls with direct calls to the outlined
+      // callbacks.
+      SmallVector<Value *, 8> Args;
+      for (auto *CI : MergableCIs) {
+        Value *Callee =
+            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
+        FunctionType *FT =
+            cast<FunctionType>(Callee->getType()->getPointerElementType());
+        Args.clear();
+        Args.push_back(OutlinedFn->getArg(0));
+        Args.push_back(OutlinedFn->getArg(1));
+        for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands();
+             u < e; ++u)
+          Args.push_back(CI->getArgOperand(u));
+
+        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
+        if (CI->getDebugLoc())
+          NewCI->setDebugLoc(CI->getDebugLoc());
+
+        // Forward parameter attributes from the fork call to the direct call.
+        for (const Attribute &A :
+             OutlinedFn->getAttributes().getParamAttributes(0))
+          NewCI->addParamAttr(0, A);
+        for (const Attribute &A :
+             OutlinedFn->getAttributes().getParamAttributes(1))
+          NewCI->addParamAttr(1, A);
+        // Payload operand u of the fork call becomes parameter u - 1 of the
+        // callback (global_tid and bound_tid occupy parameters 0 and 1).
+        for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands();
+             u < e; ++u)
+          for (const Attribute &A : CI->getAttributes().getParamAttributes(u))
+            NewCI->addParamAttr(u - 1, A);
+
+        // Emit an explicit barrier to replace the implicit fork-join barrier.
+        if (CI != MergableCIs.back()) {
+          // TODO: Remove barrier if the merged parallel region includes the
+          // 'nowait' clause.
+          OMPInfoCache.OMPBuilder.CreateBarrier(
+              InsertPointTy(NewCI->getParent(),
+                            NewCI->getNextNode()->getIterator()),
+              OMPD_parallel);
+          OMPInfoCache.OMPBuilder.finalize();
+        }
+
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region in "
+                    << ore::NV("OpenMPParallelMerge",
+                               CI->getCaller()->getName())
+                    << " merged";
+        };
+        emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
+                                       Remark);
+
+        CI->eraseFromParent();
+      }
+
+      assert(OutlinedFn != OriginalFn && "Outlining failed");
+      CGUpdater.registerOutlinedFunction(*OutlinedFn);
+      CGUpdater.reanalyzeFunction(*OriginalFn);
+
+      NumOpenMPParallelRegionsMerged += MergableCIs.size();
+
+      return true;
+    };
+
+    // Helper function that identifies sequences of
+    // __kmpc_fork_call uses in a basic block.
+    auto DetectPRsCB = [&](Use &U, Function &F) {
+      CallInst *CI = getCallIfRegularCall(U, &RFI);
+      // Ignore uses that are not direct calls to the runtime function (e.g.,
+      // the callee taken as a function pointer); getCallIfRegularCall
+      // returns null for those.
+      if (!CI)
+        return false;
+      BB2PRMap[CI->getParent()].insert(CI);
+
+      LLVM_DEBUG(dbgs() << TAG << "Found parallel regions in "
+                        << BB2PRMap.size() << " blocks in " << F.getName()
+                        << "\n");
+
+      return false;
+    };
+
+    BB2PRMap.clear();
+    RFI.foreachUse(SCC, DetectPRsCB);
+    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
+    // Find mergable parallel regions within a basic block that are
+    // safe to merge, that is any in-between instructions can safely
+    // execute in parallel after merging.
+    // TODO: support merging across basic-blocks.
+ for (auto &It : BB2PRMap) { + auto &CIs = It.getSecond(); + if (CIs.size() < 2) + continue; + + BasicBlock *BB = It.getFirst(); + SmallVector MergableCIs; + + // Find maximal number of parallel region CIs that are safe to merge. + for (Instruction &I : *BB) { + if (CIs.count(&I)) { + MergableCIs.push_back(cast(&I)); + continue; + } + + if (isSafeToSpeculativelyExecute(&I, &I, DT)) + continue; + + if (MergableCIs.size() > 1) + MergableCIsVector.push_back(MergableCIs); + + MergableCIs.clear(); + } + + if (!MergableCIsVector.empty()) { + Changed = true; + + for (auto &MergableCIs : MergableCIsVector) + Merge(MergableCIs, BB); + } + } + + if (Changed) { + // Update RFI info to set it up for later passes. + RFI.clearUsesMap(); + OMPInfoCache.collectUses(RFI, /* CollectStats */ false); + } + + return Changed; + } + /// Try to delete parallel regions if possible. bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -54,7 +54,7 @@ ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.0 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #1 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @readonly() #4 +; CHECK-NEXT: call void @readonly() #5 ; CHECK-NEXT: ret void ; entry: @@ -99,9 +99,16 @@ define void @delete_parallel_1() { ; CHECK-LABEL: define {{[^@]+}}@delete_parallel_1() ; CHECK-NEXT: entry: -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..0 to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @delete_parallel_1..omp_par to void (i32*, i32*, ...)*)) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: ; CHECK-NEXT: ret void ; entry: @@ -126,9 +133,9 @@ define internal void @.omp_outlined..0(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..0 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #4 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #5 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @readonly() #4 +; CHECK-NEXT: call void @readonly() #5 ; CHECK-NEXT: ret void ; entry: @@ -138,7 +145,7 @@ define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
{ ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #5 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #6 ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @readnone() ; CHECK-NEXT: ret void @@ -190,10 +197,16 @@ ; CHECK-NEXT: [[TMP:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 dereferenceable(4) [[TMP]]) #0 ; CHECK-NEXT: store i32 0, i32* [[A]], align 4 -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @delete_parallel_2..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) ; CHECK-NEXT: ret void @@ -214,9 +227,9 @@ define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #6 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #7 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #4 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #5 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: @@ -393,7 
+406,7 @@ define internal void @.omp.reduction.reduction_func(i8* %arg, i8* %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func -; CHECK-SAME: (i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG:%.*]], i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG1:%.*]]) #10 +; CHECK-SAME: (i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG:%.*]], i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG1:%.*]]) #11 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = bitcast i8* [[ARG1]] to i32** ; CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP]], align 8 diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -S -attributor -openmpopt < %s | FileCheck %s +; RUN: opt -S -passes='attributor,cgscc(openmpopt)' < %s | FileCheck %s +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 + +; void merge_all() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; #pragma omp parallel +; { +; a = 3; +; } +; } +; +; Merge all parallel regions +define dso_local void @merge_all() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_all() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; 
CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %a) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* %a) + ret void +} + +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined. 
+; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 2, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 2, i32* %0, align 4 + ret void +} + +declare !callback !0 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
+ +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 3, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 3, i32* %0, align 4 + ret void +} + +; void merge_none() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; } +; +; Do not merge parallel regions, in-between store +; instruction is unsafe to execute in parallel +define dso_local void @merge_none() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_none() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nofree nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: store i32 3, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nofree nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* %a) + store i32 3, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* %a) + ret void +} + +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 2, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 2, i32* %0, align 4 + ret void +} + +define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 4, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 4, i32* %0, align 4 + ret void +} + +; void merge_some() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; #pragma omp parallel +; { +; a = 5; +; } +; } +; +; Do not merge first parallel region, due to the +; unsafe store, but merge the two next parallel +; regions +define dso_local void @merge_some() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_some() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nofree nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: store i32 3, i32* [[A]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* %a) + store i32 3, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* %a) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* %a) + ret void +} + +define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 2, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 2, i32* %0, align 4 + ret void +} + +define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 4, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 4, i32* %0, align 4 + ret void +} + +define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 5, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* 
%a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 5, i32* %0, align 4 + ret void +} + +!0 = !{!1} +!1 = !{i64 2, i64 -1, i64 -1, i1 true}