Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -73,6 +73,17 @@ "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), cl::desc("Hoist conditional stores if an unconditional store precedes")); +static cl::opt SpeculativeFlattenBias( + "speculative-flatten-bias", cl::Hidden, cl::init(100), + cl::desc("Control how biased a branch needs to be to be considered rarely" + " failing for speculative flattening (default = 100)")); + +static cl::opt SpeculativeFlattenThreshold( + "speculative-flatten-threshold", cl::Hidden, cl::init(10), + cl::desc("Control how much speculation happens due to speculative" + " flattening (default = 10)")); + + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -2334,11 +2345,115 @@ return false; } + +/// Return true if B is known to be implied by A. A & B must be i1 (boolean) +static bool implies(Value *A, Value *B, const DataLayout &DL) { + assert(A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1)); + // Note that the truth table for implication is the same as <=u on i1 + // values. + Value *Simplified = SimplifyICmpInst(ICmpInst::ICMP_ULE, A, B, DL); + Constant *Con = dyn_cast_or_null(Simplified); + return Con && Con->isOneValue(); +} + +/// If we have a series of tests leading to a frequently executed fallthrough +/// path and we can test all the conditions at once, we can execute a single +/// test on the fast path and figure out which condition failed on the slow +/// path. This transformation considers a pair of branches at a time since +/// recursively considering branches pairwise will cause an entire chain to +/// collapse. This transformation is code size neutral, but makes it +/// dynamically more expensive to fail either check. As such, we only want to +/// do this if both checks are expected to essentially never fail. +/// The key motivating examples are cases like: +/// br (i < Length), in_bounds, fail1 +/// in_bounds: +/// br (i+1 < Length), in_bounds2, fail2 +/// in_bounds2: +/// ... +/// +/// We can rewrite this as: +/// br (i+1 < Length), in_bounds2, dispatch +/// in_bounds2: +/// ... +/// dispatch: +/// br (i < Length), fail2, fail1 +/// +/// TODO: we could consider duplicating some (non-speculatable) instructions +/// from BI->getParent() down both paths +static bool SpeculativelyFlattenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, + const DataLayout &DL) { + auto *PredBB = PBI->getParent(); + auto *BB = BI->getParent(); + + /// Is the failing path of this branch taken rarely if at all? + auto isRarelyUntaken = [](BranchInst *BI) { + uint64_t ProbTrue; + uint64_t ProbFalse; + if (!ExtractBranchMetadata(BI, ProbTrue, ProbFalse)) + return false; + return ProbTrue > ProbFalse * SpeculativeFlattenBias; + }; + + if (PBI->getSuccessor(0) != BB || + !isRarelyUntaken(PBI) || !isRarelyUntaken(BI) || + !implies(BI->getCondition(), PBI->getCondition(), DL)) + return false; + + // TODO: The following code performs a similiar, but slightly distinct + // transformation to that done by SpeculativelyExecuteBB. We should consider + // combining them at some point. + + // Can we speculate everything in the given block (except for the terminator + // instruction) at the instruction boundary before 'At'? + unsigned SpeculationCost = 0; + for (Instruction &I : *BB) { + if (isa(I)) break; + if (!isSafeToSpeculativelyExecute(&I, PBI)) + return false; + SpeculationCost++; + // Only flatten relatively small BBs to avoid making the bad case terrible + if (SpeculationCost > SpeculativeFlattenThreshold || isa(I)) + return false; + } + + DEBUG(dbgs() << "Outlining slow path comparison: " + << *PBI->getCondition() << " implied by " + << *BI->getCondition() << "\n"); + // See the example in the function comment. + Value *WhichCond = PBI->getCondition(); + auto *Success = BI->getSuccessor(0); + auto *FailPBI = PBI->getSuccessor(1); + auto *FailBI = BI->getSuccessor(1); + // Have PBI branch directly to the fast path using BI's condition, branch + // to this BI's block for the slow path dispatch + PBI->setSuccessor(0, Success); + PBI->setSuccessor(1, BB); + PBI->setCondition(BI->getCondition()); + // Rewrite BI to distinguish between the two failing cases + BI->setSuccessor(0, FailBI); + BI->setSuccessor(1, FailPBI); + BI->setCondition(WhichCond); + // Move all of the instructions from BI->getParent other than + // the terminator into the fastpath. This requires speculating them past + // the original PBI branch, but we checked for that legality above. + // TODO: This doesn't handle dependent loads. We could duplicate those + // down both paths, but that involves further code growth. We need to + // figure out a good cost model here. + PredBB->getInstList().splice(PBI, BB->getInstList(), + BB->begin(), std::prev(BB->end())); + + // To be conservatively correct, drop all metadata on the rewritten + // branches. TODO: update metadata + PBI->dropUnknownNonDebugMetadata(); + BI->dropUnknownNonDebugMetadata(); + return true; +} /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. -static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { +static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, + const DataLayout &DL) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -2388,6 +2503,13 @@ } } + if (auto *CE = dyn_cast(BI->getCondition())) + if (CE->canTrap()) + return false; + + if (SpeculativelyFlattenCondBranchToCondBranch(PBI, BI, DL)) + return true; + // If this is a conditional branch in an empty block, and if any // predecessors are a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. @@ -2398,11 +2520,6 @@ if (&*BBI != BI) return false; - - if (ConstantExpr *CE = dyn_cast(BI->getCondition())) - if (CE->canTrap()) - return false; - int PBIOp, BIOp; if (PBI->getSuccessor(0) == BI->getSuccessor(0)) PBIOp = BIOp = 0; @@ -4653,7 +4770,7 @@ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; return false; Index: test/Transforms/SimplifyCFG/fast-fallthrough.ll =================================================================== --- test/Transforms/SimplifyCFG/fast-fallthrough.ll +++ test/Transforms/SimplifyCFG/fast-fallthrough.ll @@ -0,0 +1,86 @@ +; RUN: opt -S %s -simplifycfg | FileCheck %s + +define void @test(i32 %length.i, i32 %i) { +; CHECK-LABEL: @test + %iplus1 = add nsw i32 %i, 1 + %var29 = icmp slt i32 %i, %length.i + %var30 = icmp slt i32 %iplus1, %length.i +; CHECK: br i1 %var30, label %in_bounds, label %next + br i1 %var29, label %next, label %out_of_bounds, !prof !{!"branch_weights", i32 1000, i32 0} + +next: +; CHECK-LABEL: next: +; CHECK: br i1 %var29, label %out_of_bounds2, label %out_of_bounds + br i1 %var30, label %in_bounds, label %out_of_bounds2, !prof !{!"branch_weights", i32 1000, i32 0} + +in_bounds: + ret void + +out_of_bounds: + call void @foo(i64 0) + unreachable + +out_of_bounds2: + call void @foo(i64 1) + unreachable +} + +define void @test2(i32 %length.i, i32 %i) { +; CHECK-LABEL: @test2 + %var29 = icmp slt i32 %i, %length.i +; CHECK: br i1 %var30, label %in_bounds, label %next + br i1 %var29, label %next, label %out_of_bounds, !prof !{!"branch_weights", i32 1000, i32 0} + +next: +; CHECK-LABEL: next: +; CHECK: br i1 %var29, label %out_of_bounds2, label %out_of_bounds + %iplus1 = add nsw i32 %i, 1 + %var30 = icmp slt i32 %iplus1, %length.i + br i1 %var30, label %in_bounds, label %out_of_bounds2, !prof !{!"branch_weights", i32 1000, i32 0} + +in_bounds: + ret void + +out_of_bounds: + call void @foo(i64 0) + unreachable + +out_of_bounds2: + call void @foo(i64 1) + unreachable +} + +; As written, this one can't trigger today. It would require us to duplicate +; the %val1 load down two paths and that's not implemented yet. +define i64 @test3(i32 %length.i, i32 %i, i64* %base) { +; CHECK-LABEL: @test3 + %var29 = icmp slt i32 %i, %length.i +; CHECK: br i1 %var29, label %next, label %out_of_bounds + br i1 %var29, label %next, label %out_of_bounds, !prof !{!"branch_weights", i32 1000, i32 0} + +next: +; CHECK-LABEL: next: + %addr1 = getelementptr i64, i64* %base, i32 %i + %val1 = load i64, i64* %addr1 + %iplus1 = add nsw i32 %i, 1 + %var30 = icmp slt i32 %iplus1, %length.i +; CHECK: br i1 %var30, label %in_bounds, label %out_of_bounds2 + br i1 %var30, label %in_bounds, label %out_of_bounds2, !prof !{!"branch_weights", i32 1000, i32 0} + +in_bounds: + %addr2 = getelementptr i64, i64* %base, i32 %iplus1 + %val2 = load i64, i64* %addr2 + %res = sub i64 %val1, %val2 + ret i64 %res + +out_of_bounds: + call void @foo(i64 0) + unreachable + +out_of_bounds2: + call void @foo(i64 %val1) + unreachable +} + +declare void @foo(i64) +