Index: llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
+++ llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
@@ -652,12 +652,21 @@
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
   unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
+  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+    Value *CommonValue = PN->getIncomingValue(0);
+    if (all_of(PN->incoming_values(),
+               [&](Value *V) { return V == CommonValue; }))
+      return CommonValue;
+    return nullptr;
+  };
+
   if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
 
     NewReturnBlock = NewReturnBlock->splitBasicBlock(
         NewReturnBlock->getFirstNonPHI()->getIterator());
     BasicBlock::iterator I = PreReturn->begin();
     Instruction *Ins = &NewReturnBlock->front();
+    SmallVector<Instruction *, 4> DeadPhis;
     while (I != PreReturn->end()) {
       PHINode *OldPhi = dyn_cast<PHINode>(I);
       if (!OldPhi)
@@ -674,8 +683,22 @@
         RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
         OldPhi->removeIncomingValue(NewE);
       }
+
+      // After incoming values splitting, the old phi may become trivial.
+      // Keeping the trivial phi can introduce a definition inside the outline
+      // region which is live-out, causing unnecessary overhead (load, store,
+      // arg passing, etc.).
+      if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+        OldPhi->replaceAllUsesWith(OldPhiVal);
+        DeadPhis.push_back(OldPhi);
+      }
+
       ++I;
     }
+
+    for (auto *DP : DeadPhis)
+      DP->eraseFromParent();
+
     for (auto E : OI->ReturnBlockPreds) {
       BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
       NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
Index: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = tail call i32 (...) @bar() #1
+  %tmp1 = icmp slt i32 %arg, 0
+  br i1 %tmp1, label %bb6, label %bb2
+
+bb2:                                              ; preds = %bb
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  %tmp3 = tail call i32 (...) @bar() #1
+  %tmp4 = icmp eq i32 %tmp3, 10
+  br i1 %tmp4, label %bb6, label %bb5
+
+bb5:                                              ; preds = %bb2
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb2, %bb
+  %tmp7 = phi i32 [ %tmp, %bb5 ], [ 0, %bb ], [ %tmp, %bb2 ]
+  ret i32 %tmp7
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+
+declare void @foo(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK:  call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK: br
+
+bb:
+  %tmp = tail call i32 @test(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT:  store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
Index: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = tail call i32 (...) @bar() #1
+  %tmp1 = icmp slt i32 %arg, 0
+  br i1 %tmp1, label %bb6, label %bb2
+
+bb2:                                              ; preds = %bb
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  %tmp3 = tail call i32 (...) @bar() #1
+  %tmp4 = icmp eq i32 %tmp3, 10
+  br i1 %tmp4, label %bb6, label %bb5
+
+bb5:                                              ; preds = %bb2
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb2, %bb
+  %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ], [ 1, %bb2 ]
+  ret i32 %tmp7
+}
+
+; Function Attrs: nounwind uwtable
+declare i32 @bar(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+declare void @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK:  call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK: br
+bb:
+  %tmp = tail call i32 @test(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT:  store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}