diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -226,10 +226,13 @@ // multi-region outlining. FunctionCloner(Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC); + function_ref LookupAC, + function_ref GetTTI); FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC); + function_ref LookupAC, + function_ref GetTTI); + ~FunctionCloner(); // Prepare for function outlining: making sure there is only @@ -266,6 +269,7 @@ std::unique_ptr ClonedFuncBFI = nullptr; OptimizationRemarkEmitter &ORE; function_ref LookupAC; + function_ref GetTTI; }; private: @@ -334,7 +338,7 @@ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to // approximate both the size and runtime cost (Note that in the current // inline cost analysis, there is no clear distinction there either). - static int computeBBInlineCost(BasicBlock *BB); + static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI); std::unique_ptr computeOutliningInfo(Function *F); std::unique_ptr @@ -448,9 +452,10 @@ // Use the same computeBBInlineCost function to compute the cost savings of // the outlining the candidate region. + TargetTransformInfo *FTTI = &GetTTI(*F); int OverallFunctionCost = 0; for (auto &BB : *F) - OverallFunctionCost += computeBBInlineCost(&BB); + OverallFunctionCost += computeBBInlineCost(&BB, FTTI); #ifndef NDEBUG if (TracePartialInlining) @@ -509,7 +514,7 @@ continue; int OutlineRegionCost = 0; for (auto *BB : DominateVector) - OutlineRegionCost += computeBBInlineCost(BB); + OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); #ifndef NDEBUG if (TracePartialInlining) @@ -843,7 +848,8 @@ // TODO: Ideally we should share Inliner's InlineCost Analysis code. // For now use a simplified version. The returned 'InlineCost' will be used // to esimate the size cost as well as runtime cost of the BB. -int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { +int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, + TargetTransformInfo *TTI) { int InlineCost = 0; const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -866,6 +872,21 @@ if (I.isLifetimeStartOrEnd()) continue; + if (auto *II = dyn_cast(&I)) { + Intrinsic::ID IID = II->getIntrinsicID(); + SmallVector Tys; + FastMathFlags FMF; + for (Value *Val : II->args()) + Tys.push_back(Val->getType()); + + if (auto *FPMO = dyn_cast(II)) + FMF = FPMO->getFastMathFlags(); + + IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); + InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); + continue; + } + if (CallInst *CI = dyn_cast(&I)) { InlineCost += getCallsiteCost(*CI, DL); continue; @@ -893,11 +914,13 @@ BasicBlock* OutliningCallBB = FuncBBPair.second; // Now compute the cost of the call sequence to the outlined function // 'OutlinedFunction' in BB 'OutliningCallBB': - OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); + auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); + OutliningFuncCallCost += + computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); // Now compute the cost of the extracted/outlined function itself: for (BasicBlock &BB : *OutlinedFunc) - OutlinedFunctionCost += computeBBInlineCost(&BB); + OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); } assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region"); @@ -962,8 +985,9 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC) - : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { + function_ref LookupAC, + function_ref GetTTI) + : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { ClonedOI = std::make_unique(); // Clone the function, so that we can hack away on it. @@ -987,8 +1011,9 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningMultiRegionInfo *OI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC) - : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { + function_ref LookupAC, + function_ref GetTTI) + : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { ClonedOMRI = std::make_unique(); // Clone the function, so that we can hack away on it. @@ -1099,10 +1124,10 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { - auto ComputeRegionCost = [](SmallVectorImpl &Region) { + auto ComputeRegionCost = [&](SmallVectorImpl &Region) { int Cost = 0; for (BasicBlock* BB : Region) - Cost += computeBBInlineCost(BB); + Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); return Cost; }; @@ -1196,9 +1221,10 @@ // Gather up the blocks that we're going to extract. std::vector ToExtract; + auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); ToExtract.push_back(ClonedOI->NonReturnBlock); - OutlinedRegionCost += - PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); + OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( + ClonedOI->NonReturnBlock, ClonedFuncTTI); for (BasicBlock &BB : *ClonedFunc) if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { ToExtract.push_back(&BB); @@ -1206,7 +1232,7 @@ // into the outlined function which may make the outlining // overhead (the difference of the outlined function cost // and OutliningRegionCost) look larger. - OutlinedRegionCost += computeBBInlineCost(&BB); + OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); } // Extract the body of the if. @@ -1276,7 +1302,7 @@ std::unique_ptr OMRI = computeOutliningColdRegionsInfo(F, ORE); if (OMRI) { - FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache); + FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); #ifndef NDEBUG if (TracePartialInlining) { @@ -1309,7 +1335,7 @@ if (!OI) return {false, nullptr}; - FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache); + FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache, GetTTI); Cloner.NormalizeReturnBlock(); Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); diff --git a/llvm/test/Transforms/PartialInlining/intrinsic-call-cost.ll b/llvm/test/Transforms/PartialInlining/intrinsic-call-cost.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PartialInlining/intrinsic-call-cost.ll @@ -0,0 +1,55 @@ +; RUN: opt -partial-inliner -S < %s | FileCheck %s + +; Checks that valid costs are computed for intrinsic calls. +; https://bugs.llvm.org/show_bug.cgi?id=45932 + + +@emit_notes = external global i8, align 2 + +; CHECK: var_reg_delete +; CHECK-NEXT: bb +; CHECK-NEXT: tail call void @delete_variable_part() +; CHECK-NEXT: ret void + +define void @var_reg_delete() { +bb: + tail call void @delete_variable_part() + ret void +} + +; CHECK: delete_variable_part +; CHECK-NEXT: bb +; CHECK-NEXT: %tmp1.i = tail call i32 @find_variable_location_part() +; CHECK-NEXT: %tmp3.i = icmp sgt i32 %tmp1.i, -1 +; CHECK-NEXT: br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit + +; CHECK: bb4.i +; CHECK-NEXT: %tmp.i.i = load i8, i8* @emit_notes +; CHECK-NEXT: %tmp1.i.i = icmp ne i8 %tmp.i.i, 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 %tmp1.i.i) +; CHECK-NEXT: unreachable + +; CHECK: delete_slot_part.exit +; CHECK-NEXT: ret void + +define void @delete_variable_part() { +bb: + %tmp1.i = tail call i32 @find_variable_location_part() + %tmp3.i = icmp sgt i32 %tmp1.i, -1 + br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit + +bb4.i: + %tmp.i.i = load i8, i8* @emit_notes, align 2 + %tmp1.i.i = icmp ne i8 %tmp.i.i, 0 + tail call void @llvm.assume(i1 %tmp1.i.i) + unreachable + +delete_slot_part.exit: + ret void +} + +; CHECK: declare i32 @find_variable_location_part +declare i32 @find_variable_location_part() + +; CHECK: declare void @llvm.assume(i1 noundef) +declare void @llvm.assume(i1 noundef)