diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -19,12 +19,14 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" using namespace llvm; @@ -55,6 +57,8 @@ STATISTIC( NumOpenMPParallelRegionsReplacedInGPUStateMachine, "Number of OpenMP parallel regions replaced with ID in GPU state machines"); +STATISTIC(NumOpenMPParallelRegionsMerged, + "Number of OpenMP parallel regions merged"); #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; @@ -506,8 +510,9 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); - Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + Changed |= mergeParallelRegions(); + Changed |= deduplicateRuntimeCalls(); return Changed; } @@ -571,6 +576,209 @@ } private: + /// Merge parallel regions when it is safe. + bool mergeParallelRegions() { + const unsigned CallbackCalleeOperand = 2; + const unsigned CallbackFirstArgOperand = 3; + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + + // Check if there are any __kmpc_fork_call calls to merge. + OMPInformationCache::RuntimeFunctionInfo &RFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; + + if (!RFI.Declaration) + return false; + + bool Changed = false; + LoopInfo *LI = nullptr; + DominatorTree *DT = nullptr; + + SmallDenseMap> BB2PRMap; + + BasicBlock *StartBB, *EndBB; + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + BasicBlock *CGStartBB = CodeGenIP.getBlock(); + BasicBlock *CGEndBB = + SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); + CGStartBB->getTerminator()->setSuccessor(0, StartBB); + EndBB->getTerminator()->setSuccessor(0, CGEndBB); + }; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &VPtr, Value *&ReplacementValue) -> InsertPointTy { + ReplacementValue = &VPtr; + return CodeGenIP; + }; + + auto FiniCB = [&](InsertPointTy CodeGenIP) {}; + + // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all + // contained in BB and only separated by instructions that can be + // redundantly executed in parallel. The block BB is split before the first + // call (in MergableCIs) and after the last so the entire region we merge + // into a single parallel region is contained in a single basic block + // without any other instructions. We use the OpenMPIRBuilder to outline + // that block and call the resulting function via __kmpc_fork_call. + auto Merge = [&](SmallVectorImpl &MergableCIs, BasicBlock *BB) { + // TODO: Change the interface to allow single CIs expanded, e.g, to + // include an outer loop. + assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); + + Function *OriginalFn = BB->getParent(); + LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size() + << " parallel regions in " << OriginalFn->getName() + << "\n"); + + // Isolate the calls to merge in a separate block. + EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); + BasicBlock *AfterBB = + SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); + StartBB = + SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, "expnd_omp_pr"); + + assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); + const DebugLoc DL = BB->getTerminator()->getDebugLoc(); + BB->getTerminator()->eraseFromParent(); + + OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), + DL); + // TODO: Verify proc bind matches and use the value. + // TODO: Check the cancellable flag. + InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel( + Loc, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, + OMP_PROC_BIND_default, /* IsCancellable */ false); + BranchInst::Create(AfterBB, AfterIP.getBlock()); + + // Perform the actual outlining. + OMPInfoCache.OMPBuilder.finalize(); + + Function *OutlinedFn = MergableCIs.front()->getCaller(); + + // Replace the __kmpc_fork_call calls with direct calls to the outlined + // callbacks. + SmallVector Args; + for (auto *CI : MergableCIs) { + Value *Callee = + CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); + FunctionType *FT = + cast(Callee->getType()->getPointerElementType()); + Args.clear(); + Args.push_back(OutlinedFn->getArg(0)); + Args.push_back(OutlinedFn->getArg(1)); + for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands(); + u < e; ++u) + Args.push_back(CI->getArgOperand(u)); + + CallInst *newCI = CallInst::Create(FT, Callee, Args, "", CI); + if (CI->getDebugLoc()) + newCI->setDebugLoc(CI->getDebugLoc()); + + // Forward parameter attributes. + for (const Attribute &A : + OutlinedFn->getAttributes().getParamAttributes(0)) + newCI->addParamAttr(0, A); + for (const Attribute &A : + OutlinedFn->getAttributes().getParamAttributes(1)) + newCI->addParamAttr(1, A); + for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands(); + u < e; ++u) + for (const Attribute &A : CI->getAttributes().getParamAttributes(u)) + newCI->addParamAttr(u - 1, A); + + // Emit an explicit barrier to replace the implicit fork-join barrier. + if (CI != MergableCIs.back()) { + // TODO: Remove barrier if the merged parallel region includes the + // 'nowait' clause. + OMPInfoCache.OMPBuilder.CreateBarrier( + InsertPointTy(newCI->getParent(), + newCI->getNextNode()->getIterator()), + OMPD_parallel); + OMPInfoCache.OMPBuilder.finalize(); + } + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Parallel region in " + << ore::NV("OpenMPParallelMerge", + CI->getCaller()->getName()) + << " merged"; + }; + emitRemark(CI, "OpenMPParallelRegionMerging", + Remark); + + CI->eraseFromParent(); + } + + assert(OutlinedFn != OriginalFn && "Outlining failed"); + CGUpdater.registerOutlinedFunction(*OutlinedFn); + CGUpdater.reanalyzeFunction(*OriginalFn); + + NumOpenMPParallelRegionsMerged += MergableCIs.size(); + + return true; + }; + + // Helper function that performs that identifes sequences of + // __kmpc_fork_call uses in a basic block. + auto DetectPRsCB = [&](Use &U, Function &F) { + CallInst *CI = getCallIfRegularCall(U, &RFI); + BB2PRMap[CI->getParent()].insert(CI); + + LLVM_DEBUG(dbgs() << TAG << "Found parallel regions in " + << BB2PRMap.size() << " blocks in " << F.getName() + << "\n"); + + return false; + }; + + BB2PRMap.clear(); + RFI.foreachUse(SCC, DetectPRsCB); + SmallVector, 4> MergableCIsVector; + // Find mergable parallel regions within a basic block that are + // safe to merge, that is any in-between instructions can safely + // execute in parallel after merging. + // TODO: support merging across basic-blocks. + for (auto &It : BB2PRMap) { + auto &CIs = It.getSecond(); + if (CIs.size() < 2) + continue; + + BasicBlock *BB = It.getFirst(); + SmallVector MergableCIs; + + // Find maximal number of parallel region CIs that are safe to merge. + for (Instruction &I : *BB) { + if (CIs.count(&I)) { + MergableCIs.push_back(cast(&I)); + continue; + } + + if (isSafeToSpeculativelyExecute(&I, &I, DT)) + continue; + + if (MergableCIs.size() > 1) + MergableCIsVector.push_back(MergableCIs); + + MergableCIs.clear(); + } + + if (!MergableCIsVector.empty()) { + Changed = true; + + for (auto &MergableCIs : MergableCIsVector) + Merge(MergableCIs, BB); + } + } + + if (Changed) { + // Update RFI info to set it up for later passes. + RFI.clearUsesMap(); + OMPInfoCache.collectUses(RFI, /* CollectStats */ false); + } + + return Changed; + } + /// Try to delete parallel regions if possible. bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; @@ -1299,8 +1507,10 @@ // TODO: Compute the module slice we are allowed to look at. OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(); - (void)Changed; - return PreservedAnalyses::all(); + if (Changed) + return PreservedAnalyses::none(); + else + return PreservedAnalyses::all(); } namespace { diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -39,24 +39,47 @@ } define internal void @.omp_outlined.willreturn(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #0 +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @unknown() #0 +; CHECK-NEXT: ret void +; entry: call void @unknown() willreturn ret void } define internal void @.omp_outlined.willreturn.0(i32* noalias %.global_tid., i32* noalias %.bound_tid.) willreturn { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.0 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @readonly() #5 +; CHECK-NEXT: ret void +; entry: call void @readonly() ret void } define internal void @.omp_outlined.willreturn.1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.1 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #2 +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @readnone() #0 +; CHECK-NEXT: ret void +; entry: call void @readnone() willreturn ret void } define internal void @.omp_outlined.willreturn.2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.2 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #3 +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; entry: ret void } @@ -76,9 +99,14 @@ define void @delete_parallel_1() { ; CHECK-LABEL: define {{[^@]+}}@delete_parallel_1() ; CHECK-NEXT: entry: -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..0 to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @delete_parallel_1..omp_par to void (i32*, i32*, ...)*)) +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: ; CHECK-NEXT: ret void ; entry: @@ -90,24 +118,47 @@ } define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined. +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; entry: call void @unknown() ret void } define internal void @.omp_outlined..0(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..0 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #5 +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @readonly() #5 +; CHECK-NEXT: ret void +; entry: call void @readonly() ret void } define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #6 +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @readnone() +; CHECK-NEXT: ret void +; entry: call void @readnone() ret void } define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #3 +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; entry: ret void } @@ -144,10 +195,14 @@ ; CHECK-NEXT: [[TMP:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 dereferenceable(4) [[TMP]]) #0 ; CHECK-NEXT: store i32 0, i32* [[A]], align 4 -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @delete_parallel_2..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) ; CHECK-NEXT: ret void @@ -167,6 +222,20 @@ } define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #7 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #5 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP]], 1 +; CHECK-NEXT: store i32 [[INC]], i32* [[A]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; entry: %call = call i32 @omp_get_thread_num() %cmp = icmp eq i32 %call, 0 @@ -183,6 +252,22 @@ } define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 +; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* nonnull @0, i32 [[TMP]]) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] +; CHECK: omp_if.then: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; CHECK-NEXT: store i32 [[INC]], i32* [[A]], align 4 +; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* nonnull @0, i32 [[TMP]]) +; CHECK-NEXT: br label [[OMP_IF_END]] +; CHECK: omp_if.end: +; CHECK-NEXT: ret void +; entry: %tmp = load i32, i32* %.global_tid., align 4 %tmp1 = call i32 @__kmpc_master(%struct.ident_t* nonnull @0, i32 %tmp) @@ -209,6 +294,24 @@ declare void @__kmpc_end_master(%struct.ident_t*, i32) define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5 +; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* nonnull @0, i32 [[TMP]]) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] +; CHECK: omp_if.then: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; CHECK-NEXT: store i32 [[INC]], i32* [[A]], align 4 +; CHECK-NEXT: call void @__kmpc_end_single(%struct.ident_t* nonnull @0, i32 [[TMP]]) +; CHECK-NEXT: br label [[OMP_IF_END]] +; CHECK: omp_if.end: +; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* nonnull @1, i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK-NEXT: ret void +; entry: %omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) %tmp = load i32, i32* %.global_tid., align 4 @@ -229,6 +332,39 @@ } define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6 +; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +; CHECK-NEXT: [[TMP:%.*]] = bitcast i32* [[A1]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 [[TMP]]) #0 +; CHECK-NEXT: store i32 1, i32* [[A1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i32** +; CHECK-NEXT: store i32* [[A1]], i32** [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @2, i32 [[TMP2]], i32 1, i64 8, i8* nonnull align 8 [[TMP3]], void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) +; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: .omp.reduction.case1: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[A1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +; CHECK-NEXT: call void @__kmpc_end_reduce_nowait(%struct.ident_t* nonnull @2, i32 [[TMP2]], [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) +; CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +; CHECK: .omp.reduction.case2: +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[A1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = atomicrmw add i32* [[A]], i32 [[TMP7]] monotonic +; CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +; CHECK: .omp.reduction.default: +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[A1]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP9]]) +; CHECK-NEXT: ret void +; entry: %a1 = alloca i32, align 4 %.omp.reduction.red_list = alloca [1 x i8*], align 8 @@ -241,8 +377,8 @@ %tmp3 = bitcast [1 x i8*]* %.omp.reduction.red_list to i8* %tmp4 = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @2, i32 %tmp2, i32 1, i64 8, i8* nonnull %tmp3, void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) switch i32 %tmp4, label %.omp.reduction.default [ - i32 1, label %.omp.reduction.case1 - i32 2, label %.omp.reduction.case2 + i32 1, label %.omp.reduction.case1 + i32 2, label %.omp.reduction.case2 ] .omp.reduction.case1: ; preds = %entry @@ -265,6 +401,19 @@ } define internal void @.omp.reduction.reduction_func(i8* %arg, i8* %arg1) { +; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func +; CHECK-SAME: (i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG:%.*]], i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG1:%.*]]) #11 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = bitcast i8* [[ARG1]] to i32** +; CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARG]] to i32** +; CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP4]], align 4 +; CHECK-NEXT: ret void +; entry: %tmp = bitcast i8* %arg1 to i32** %tmp2 = load i32*, i32** %tmp, align 8 diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -S -attributor -openmpopt < %s | FileCheck %s +; RUN: opt -S -passes='attributor,cgscc(openmpopt)' < %s | FileCheck %s +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 + +; void merge_all() { +; #pragma omp parallel +; { +; } +; #pragma omp parallel +; { +; } +; } +; +; Merge all parallel regions +define dso_local void @merge_all() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_all() #0 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*)) +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: ret void +; +entry: + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*)) + ret void +} + +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined. +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} + +declare !callback !2 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #2 + +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} +; void merge_none() { +; int a = 1; +; #pragma omp parallel +; { +; } +; a = 2; +; #pragma omp parallel +; { +; } +; } +; +; Do not merge parallel regions, in-between store +; instruction is unsafe to execute in parallel +define dso_local void @merge_none() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_none() #0 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*)) + store i32 2, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*)) + ret void +} + +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} + +define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} + +; void merge_some() { +; int a = 1; +; #pragma omp parallel +; { +; } +; a = 2; +; #pragma omp parallel +; { +; } +; #pragma omp parallel +; { +; } +; } +; +; Do not merge first parallel region, due to the +; unsafe store, but merge the two next parallel +; regions +define dso_local void @merge_some() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_some() #0 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*)) +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*)) + store i32 2, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*)) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*)) + ret void +} + +define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} + +define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} + +define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 11.0.0 (git@github.com:ggeorgakoudis/llvm-project.git e606e9aa891b114da1e9a0dbcede01dfc077f3f8)"} +!2 = !{!3} +!3 = !{i64 2, i64 -1, i64 -1, i1 true}