Index: llvm/lib/Transforms/IPO/HotColdSplitting.cpp =================================================================== --- llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -332,6 +332,10 @@ static constexpr unsigned ScoreForSuccBlock = 1; static constexpr unsigned ScoreForSinkBlock = 1; + using BlockSetVector = SetVector; + + using PHIFrontierMap = SmallDenseMap; + OutliningRegion(const OutliningRegion &) = delete; OutliningRegion &operator=(const OutliningRegion &) = delete; @@ -344,6 +348,14 @@ const PostDomTree &PDT) { OutliningRegion ColdRegion; + SmallPtrSet RegionBlocks; + + auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) { + RegionBlocks.insert(BB); + ColdRegion.Blocks.emplace_back(BB, Score); + assert(RegionBlocks.size() == ColdRegion.Blocks.size() && "Duplicate BB"); + }; + // The ancestor farthest-away from SinkBB, and also post-dominated by it. unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock); ColdRegion.SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr; @@ -379,13 +391,13 @@ BestScore = PredScore; } - ColdRegion.Blocks.emplace_back(&PredBB, PredScore); + addBlockToRegion(&PredBB, PredScore); ++PredIt; } // Add SinkBB to the cold region. It's considered as an entry point before // any sink-successor blocks. - ColdRegion.Blocks.emplace_back(&SinkBB, SinkScore); + addBlockToRegion(&SinkBB, SinkScore); // Find all successors of SinkBB dominated by SinkBB using DFS. auto SuccIt = ++df_begin(&SinkBB); @@ -394,9 +406,12 @@ BasicBlock &SuccBB = **SuccIt; bool SinkDom = DT.dominates(&SinkBB, &SuccBB); + // Don't allow the backwards & forwards DFSes to mark the same block. + bool DuplicateBlock = RegionBlocks.count(&SuccBB); + // If SinkBB does not dominate a successor, do not mark the successor (or // any of its successors) cold. - if (!SinkDom || !mayExtractBlock(SuccBB)) { + if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) { SuccIt.skipChildren(); continue; } @@ -407,7 +422,7 @@ BestScore = SuccScore; } - ColdRegion.Blocks.emplace_back(&SuccBB, SuccScore); + addBlockToRegion(&SuccBB, SuccScore); ++SuccIt; } @@ -430,7 +445,8 @@ // Remove blocks dominated by the suggested entry point from this region. // During the removal, identify the next best entry point into the region. // Ensure that the first extracted block is the suggested entry point. - BlockSequence SubRegion{SuggestedEntryPoint}; + BlockSetVector SubRegion; + SubRegion.insert(SuggestedEntryPoint); BasicBlock *NextEntryPoint = nullptr; unsigned NextScore = 0; auto RegionEndIt = Blocks.end(); @@ -444,7 +460,7 @@ NextScore = Score; } if (InSubRegion && BB != SuggestedEntryPoint) - SubRegion.push_back(BB); + SubRegion.insert(BB); return InSubRegion; }); Blocks.erase(RegionStartIt, RegionEndIt); @@ -452,7 +468,56 @@ // Update the suggested entry point. SuggestedEntryPoint = NextEntryPoint; - return SubRegion; + // Prune blocks which would cause PHIs outside of the outlining region to + // have non-unique incoming values from the outlining region. A worklist is + // needed because pruning one invalid block may invalidate other blocks. + SmallPtrSet InvalidBlocks; + do { + // Map PHIs outside of the outlining region to a unique incoming value + // originating from the outlining region. + InvalidBlocks.clear(); + PHIFrontierMap PHIFrontier; + for (BasicBlock *BB : SubRegion) + if (!updatePHIFrontier(*BB, SubRegion, PHIFrontier)) + InvalidBlocks.insert(BB); + + // Remove blocks dominated by invalid blocks from the outlining region. + SubRegion.remove_if([&](BasicBlock *BB) { + return InvalidBlocks.count(BB) || + any_of(InvalidBlocks, [&](BasicBlock *InvalidBB) { + return DT.dominates(InvalidBB, BB); + }); + }); + } while (!InvalidBlocks.empty()); + + return SubRegion.takeVector(); + } + +private: + /// Update the PHI frontier by recording \p BB's outgoing values. If \p BB + /// can be added to the outlining region, return true. + static bool updatePHIFrontier(BasicBlock &BB, + const BlockSetVector &RegionBlocks, + PHIFrontierMap &PHIFrontier) { + // Make sure that each successor of BB in the frontier has a unique + // incoming value originating from the outlining region. + for (BasicBlock *SuccBB : successors(&BB)) { + // Ignore self-edges and successors which aren't in the frontier. + if (SuccBB == &BB || RegionBlocks.count(SuccBB)) + continue; + + for (const PHINode &SuccPhi : SuccBB->phis()) { + // Record the incoming value from BB. Flag it if it isn't unique. + Value *&RecordedIncomingVal = PHIFrontier[&SuccPhi]; + Value *ActualIncomingVal = SuccPhi.getIncomingValueForBlock(&BB); + if (RecordedIncomingVal) + if (ActualIncomingVal != RecordedIncomingVal) + return false; + + RecordedIncomingVal = ActualIncomingVal; + } + } + return true; } }; @@ -533,6 +598,12 @@ continue; } + LLVM_DEBUG({ + dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; + for (BasicBlock *BB : SubRegion) + BB->dump(); + }); + Function *Outlined = extractColdRegion(SubRegion, DT, BFI, TTI, ORE, OutlinedFunctionID); if (Outlined) { Index: llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll @@ -0,0 +1,63 @@ +; RUN: opt -S -hotcoldsplit < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +; CHECK-LABEL: define {{.*}}@foo( +; CHECK: call {{.*}}@foo.cold.1( +; CHECK: unreachable + +; CHECK-LABEL: define {{.*}}@foo.cold.1( +; CHECK: switch i32 undef, label %sw.epilog.i +define void @foo(i32 %QMM) { +entry: + switch i32 %QMM, label %entry.if.end16_crit_edge [ + i32 1, label %if.then + ] + +entry.if.end16_crit_edge: ; preds = %entry + br label %if.end16 + +if.then: ; preds = %entry + br i1 undef, label %cond.true.i.i, label %_ZN10StringView8popFrontEv.exit.i + +cond.true.i.i: ; preds = %if.then + ret void + +_ZN10StringView8popFrontEv.exit.i: ; preds = %if.then + switch i32 undef, label %sw.epilog.i [ + i32 81, label %if.end16 + i32 82, label %sw.bb4.i + i32 83, label %sw.bb8.i + i32 84, label %sw.bb12.i + i32 65, label %if.end16 + i32 66, label %sw.bb20.i + i32 67, label %sw.bb24.i + i32 68, label %sw.bb28.i + ] + +sw.bb4.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +sw.bb8.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +sw.bb12.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +sw.bb20.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +sw.bb24.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +sw.bb28.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +sw.epilog.i: ; preds = %_ZN10StringView8popFrontEv.exit.i + br label %if.end16 + +if.end16: ; preds = %sw.epilog.i, %sw.bb28.i, %sw.bb24.i, %sw.bb20.i, %sw.bb12.i, %sw.bb8.i, %sw.bb4.i, %_ZN10StringView8popFrontEv.exit.i, %_ZN10StringView8popFrontEv.exit.i, %entry.if.end16_crit_edge + %0 = phi i8 [ 0, %entry.if.end16_crit_edge ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 1, %sw.bb4.i ], [ 2, %sw.bb8.i ], [ 3, %sw.bb12.i ], [ 1, %sw.bb20.i ], [ 2, %sw.bb24.i ], [ 3, %sw.bb28.i ], [ 0, %sw.epilog.i ] + unreachable +} Index: llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll @@ -0,0 +1,29 @@ +; RUN: opt -hotcoldsplit -S < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +; CHECK-LABEL: define {{.*}}@fun +; CHECK: call {{.*}}@fun.cold.1( +define void @fun() { +entry: + br i1 undef, label %if.then, label %if.else + +if.then: + ; This will be marked by the inverse DFS on sink-predecesors. + br label %sink + +sink: + call void @sink() + + ; Do not allow the forward-DFS on sink-successors to mark the block again. + br i1 undef, label %if.then, label %if.then.exit + +if.then.exit: + ret void + +if.else: + ret void +} + +declare void @sink() cold Index: llvm/test/Transforms/HotColdSplit/iterated-pruning-for-valid-phi-frontier.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/iterated-pruning-for-valid-phi-frontier.ll @@ -0,0 +1,69 @@ +; RUN: opt -hotcoldsplit -S < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +; In this function, the cold block is bb2, and the surrounding outlining region +; includes bb1, bb2 (the sink), bb3, bb4, bb5, and bb6. +; +; Note that bb7 is outside of the outlining region (because it has bb as a +; predecessor, and bb is not cold). bb7 has a phi with incoming values coming +; from bb, bb2, and bb6, i.e. it has multiple non-unique incoming values from +; from within the outlining region. +; +; There is no way to outline bb6 without breaking the phi inside of bb7, +; therefore we must remove bb6 from the outlining region. +; +; However, removing bb6 from the outlining region *also* makes it impossible +; to outline either bb4 or bb5, because bb6 would then have a phi with non- +; unique incoming values from the outlining region. Prior to removing bb6 this +; is not obvious, but after doing so it is. +; +; Therefore some kind of iterated pruning algorithm is needed. + +; CHECK-LABEL: define {{.*}}@pluto.cold.1( +; CHECK: call {{.*}}@sideeffect(i32 1) +; CHECK: call {{.*}}@sink( +; CHECK: call {{.*}}@sideeffect(i32 3) +; CHECK: call {{.*}}@sideeffect(i32 4) +; CHECK-NOT: call {{.*}}@sideeffect +define void @pluto() { +bb: + switch i8 undef, label %bb1 [ + i8 0, label %bb7 + i8 1, label %bb7 + ] + +bb1: ; preds = %bb + call void @sideeffect(i32 1) + br label %bb2 + +bb2: ; preds = %bb1 + call void @sink() + br i1 undef, label %bb7, label %bb3 + +bb3: ; preds = %bb2 + call void @sideeffect(i32 3) + br label %bb4 + +bb4: ; preds = %bb3 + call void @sideeffect(i32 4) + br i1 undef, label %bb5, label %bb6 + +bb5: ; preds = %bb4 + call void @sideeffect(i32 5) + br label %bb6 + +bb6: ; preds = %bb5, %bb4 + %tmp = phi i1 [ true, %bb5 ], [ false, %bb4 ] + call void @sideeffect(i32 6) + br label %bb7 + +bb7: ; preds = %bb6, %bb2, %bb, %bb + %tmp8 = phi i1 [ true, %bb ], [ true, %bb ], [ true, %bb2 ], [ %tmp, %bb6 ] + ret void +} + +declare void @sink() cold + +declare void @sideeffect(i32) Index: llvm/test/Transforms/HotColdSplit/outline-while-loop.ll =================================================================== --- llvm/test/Transforms/HotColdSplit/outline-while-loop.ll +++ llvm/test/Transforms/HotColdSplit/outline-while-loop.ll @@ -55,6 +55,47 @@ ret void } +; This is the same as @foo, but the while loop comes after the sink block. +; CHECK-LABEL: define {{.*}}@while_loop_after_sink( +; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl +; CHECK-LABEL: codeRepl: +; CHECK-NEXT: call void @while_loop_after_sink.cold.1 +; CHECK-LABEL: if.end: +; CHECK: call void @sideeffect(i32 1) +define void @while_loop_after_sink(i32 %cond) { +entry: + %tobool = icmp eq i32 %cond, 0 + br i1 %tobool, label %if.end, label %sink + +sink: + tail call void (...) @sink() + br label %while.cond.preheader + +while.cond.preheader: + %cmp3 = icmp sgt i32 %cond, 10 + br i1 %cmp3, label %while.body.preheader, label %while.end + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ] + %dec = add nsw i32 %cond.addr.04, -1 + tail call void @sideeffect(i32 0) #3 + %cmp = icmp sgt i32 %dec, 10 + br i1 %cmp, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %while.cond.preheader + ret void + +if.end: ; preds = %entry + tail call void @sideeffect(i32 1) + ret void +} + ; CHECK-LABEL: define {{.*}}@foo.cold.1 ; CHECK: phi i32 ; CHECK-NEXT: add nsw i32 @@ -62,6 +103,14 @@ ; CHECK-NEXT: icmp ; CHECK-NEXT: br +; CHECK-LABEL: define {{.*}}@while_loop_after_sink.cold.1 +; CHECK: call {{.*}}@sink +; CHECK: phi i32 +; CHECK-NEXT: add nsw i32 +; CHECK-NEXT: call {{.*}}@sideeffect +; CHECK-NEXT: icmp +; CHECK-NEXT: br + declare void @sideeffect(i32) declare void @sink(...) cold Index: llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll @@ -0,0 +1,33 @@ +; RUN: opt -S -hotcoldsplit < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +; CHECK-LABEL: define {{.*}}@foo( +; CHECK: phi i32 [ 0, %entry ], [ 1, %codeRepl ], [ 3, %coldbb2 ] + +; CHECK-LABEL: define {{.*}}@foo.cold.1( +; CHECK: call {{.*}}@sink + +define void @foo(i32 %cond) { +entry: + %tobool = icmp eq i32 %cond, 0 + br i1 %tobool, label %if.end, label %coldbb + +coldbb: + call void @sink() + call void @sideeffect() + call void @sideeffect() + br i1 undef, label %if.end, label %coldbb2 + +coldbb2: + br label %if.end + +if.end: + %p = phi i32 [0, %entry], [1, %coldbb], [3, %coldbb2] + ret void +} + +declare void @sink() cold + +declare void @sideeffect() Index: llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll @@ -0,0 +1,56 @@ +; RUN: opt -S -hotcoldsplit < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +; CHECK-LABEL: define {{.*}}@exit_block_with_same_incoming_vals +; CHECK: call {{.*}}@exit_block_with_same_incoming_vals.cold.1( +; CHECK-NOT: br i1 undef +; CHECK: phi i32 [ 0, %entry ], [ 1, %codeRepl ] +define void @exit_block_with_same_incoming_vals(i32 %cond) { +entry: + %tobool = icmp eq i32 %cond, 0 + br i1 %tobool, label %if.end, label %coldbb + +coldbb: + call void @sink() + call void @sideeffect() + call void @sideeffect() + br i1 undef, label %if.end, label %coldbb2 + +coldbb2: + %p2 = phi i32 [0, %coldbb], [1, %coldbb2] + br i1 undef, label %if.end, label %coldbb2 + +if.end: + %p = phi i32 [0, %entry], [1, %coldbb], [1, %coldbb2] + ret void +} + +; CHECK-LABEL: define {{.*}}@exit_block_with_distinct_incoming_vals +; CHECK: call {{.*}}@exit_block_with_distinct_incoming_vals.cold.1( +; CHECK: br i1 undef +; CHECK: phi i32 [ 0, %entry ], [ 1, %codeRepl ], [ 2, %coldbb2 ] +define void @exit_block_with_distinct_incoming_vals(i32 %cond) { +entry: + %tobool = icmp eq i32 %cond, 0 + br i1 %tobool, label %if.end, label %coldbb + +coldbb: + call void @sink() + call void @sideeffect() + call void @sideeffect() + br i1 undef, label %if.end, label %coldbb2 + +coldbb2: + %p2 = phi i32 [0, %coldbb], [1, %coldbb2] + br i1 undef, label %if.end, label %coldbb2 + +if.end: + %p = phi i32 [0, %entry], [1, %coldbb], [2, %coldbb2] + ret void +} + +declare void @sink() cold + +declare void @sideeffect()