Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2334,11 +2334,113 @@
   return false;
 }
 
+
+/// Return true if B is known to be implied by A. A & B must be i1 (boolean)
+static bool implies(Value *A, Value *B, const DataLayout &DL) {
+  assert(A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1));
+  // Note that the truth table for implication is the same as <=u on i1
+  // values.
+  Value *Simplified = SimplifyICmpInst(ICmpInst::ICMP_ULE, A, B, DL);
+  if (!Simplified) return false;
+  Constant *Con = dyn_cast<Constant>(Simplified);
+  return Con && Con->isOneValue();
+}
+
+/// If we have a series of tests leading to a frequently executed fallthrough
+/// path and we can test all the conditions at once, we can execute a single
+/// test on the fast path and figure out which condition failed on the slow
+/// path. This is code size neutral, but makes it dynamically more expensive
+/// to fail either check. As such, we only want to do this if both checks are
+/// expected to essentially never fail.
+/// The key motivating examples are cases like:
+/// br (i < Length), in_bounds, fail1
+/// in_bounds:
+/// br (i+1 < Length), in_bounds2, fail2
+/// in_bounds2:
+/// ...
+///
+/// We can rewrite this as:
+/// br (i+1 < Length), in_bounds2, dispatch
+/// in_bounds2:
+/// ...
+/// dispatch:
+/// br (i < Length), fail2, fail1
+///
+/// TODO: we could consider duplicating some (non-speculatable) instructions
+/// from BI->getParent() down both paths
+static bool TryToSpeculativelyFlattenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+                                                            const DataLayout &DL) {
+  auto *PredBB = PBI->getParent();
+  auto *BB = BI->getParent();
+
+  /// Is the failing path of this branch taken rarely if at all?
+  auto isRarelyUntaken = [](BranchInst *BI) {
+    uint64_t ProbTrue;
+    uint64_t ProbFalse;
+    if (!ExtractBranchMetadata(BI, ProbTrue, ProbFalse))
+      return false;
+    return ProbTrue > ProbFalse * 100;
+  };
+
+  if (PBI->getSuccessor(0) != BB ||
+      !isRarelyUntaken(PBI) || !isRarelyUntaken(BI) ||
+      !implies(BI->getCondition(), PBI->getCondition(), DL))
+    return false;
+
+  // TODO: The following code performs a similar, but slightly distinct
+  // transformation to that done by SpeculativelyExecuteBB. We should consider
+  // combining them at some point.
+
+  // Can we speculate everything in the given block (except for the terminator
+  // instruction) at the instruction boundary before 'PBI'?
+  unsigned SpeculationCost = 0;
+  for (Instruction &I : *BB) {
+    if (isa<TerminatorInst>(I)) break;
+    if (!isSafeToSpeculativelyExecute(&I, PBI))
+      return false;
+    SpeculationCost++;
+    // Only flatten relatively small BBs to avoid making the bad case terrible
+    if (SpeculationCost > 10 || isa<CallInst>(I) || isa<InvokeInst>(I))
+      return false;
+  }
+
+  DEBUG(dbgs() << "Outlining slow path comparison: "
+               << *PBI->getCondition() << " implied by "
+               << *BI->getCondition() << "\n");
+  Value *WhichCond = PBI->getCondition();
+  auto *Success = BI->getSuccessor(0);
+  auto *Fail1 = PBI->getSuccessor(1);
+  auto *Fail2 = BI->getSuccessor(1);
+  // Have PBI branch directly to the fast path using BI's condition, branch
+  // to this BI's block for the slow path dispatch
+  PBI->setSuccessor(0, Success);
+  PBI->setSuccessor(1, BB);
+  PBI->setCondition(BI->getCondition());
+  // Rewrite BI to distinguish between the two failing cases
+  BI->setSuccessor(0, Fail2);
+  BI->setSuccessor(1, Fail1);
+  BI->setCondition(WhichCond);
+  // Move all of the instructions from BI->getParent other than
+  // the terminator into the fastpath. This requires speculating them past
+  // the original PBI branch, but we checked for that legality above.
+  // TODO: This doesn't handle dependent loads.
We could duplicate those
+  // down both paths, but that involves further code growth. We need to
+  // figure out a good cost model here.
+  PredBB->getInstList().splice(PBI, BB->getInstList(),
+                               BB->begin(), std::prev(BB->end()));
+
+  // To be conservatively correct, drop all metadata on the rewritten
+  // branches. TODO: update metadata
+  PBI->dropUnknownNonDebugMetadata();
+  BI->dropUnknownNonDebugMetadata();
+  return true;
+}
 
 /// If we have a conditional branch as a predecessor of another block,
 /// this function tries to simplify it. We know
 /// that PBI and BI are both conditional branches, and BI is in one of the
 /// successor blocks of PBI - PBI branches to BI.
-static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+                                           const DataLayout &DL) {
   assert(PBI->isConditional() && BI->isConditional());
   BasicBlock *BB = BI->getParent();
@@ -2388,6 +2490,13 @@
     }
   }
 
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+    if (CE->canTrap())
+      return false;
+
+  if (TryToSpeculativelyFlattenCondBranchToCondBranch(PBI, BI, DL))
+    return true;
+
   // If this is a conditional branch in an empty block, and if any
   // predecessors are a conditional branch to one of our destinations,
   // fold the conditions into logical ops and one cond br.
@@ -2398,11 +2507,6 @@
   if (&*BBI != BI)
     return false;
 
-
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
-    if (CE->canTrap())
-      return false;
-
   int PBIOp, BIOp;
   if (PBI->getSuccessor(0) == BI->getSuccessor(0))
     PBIOp = BIOp = 0;
@@ -4734,7 +4838,7 @@
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
     if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
       if (PBI != BI && PBI->isConditional())
-        if (SimplifyCondBranchToCondBranch(PBI, BI))
+        if (SimplifyCondBranchToCondBranch(PBI, BI, DL))
           return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   return false;
Index: test/Transforms/SimplifyCFG/fast-fallthrough.ll
===================================================================
--- test/Transforms/SimplifyCFG/fast-fallthrough.ll
+++ test/Transforms/SimplifyCFG/fast-fallthrough.ll
@@ -0,0 +1,86 @@
+; RUN: opt -S %s -simplifycfg | FileCheck %s
+
+define void @test(i32 %length.i, i32 %i) {
+; CHECK-LABEL: @test
+  %iplus1 = add nsw i32 %i, 1
+  %var29 = icmp slt i32 %i, %length.i
+  %var30 = icmp slt i32 %iplus1, %length.i
+; CHECK: br i1 %var30, label %in_bounds, label %next
+  br i1 %var29, label %next, label %out_of_bounds, !prof !{!"branch_weights", i32 1000, i32 0}
+
+next:
+; CHECK-LABEL: next:
+; CHECK: br i1 %var29, label %out_of_bounds2, label %out_of_bounds
+  br i1 %var30, label %in_bounds, label %out_of_bounds2, !prof !{!"branch_weights", i32 1000, i32 0}
+
+in_bounds:
+  ret void
+
+out_of_bounds:
+  call void @foo(i64 0)
+  unreachable
+
+out_of_bounds2:
+  call void @foo(i64 1)
+  unreachable
+}
+
+define void @test2(i32 %length.i, i32 %i) {
+; CHECK-LABEL: @test2
+  %var29 = icmp slt i32 %i, %length.i
+; CHECK: br i1 %var30, label %in_bounds, label %next
+  br i1 %var29, label %next, label %out_of_bounds, !prof !{!"branch_weights", i32 1000, i32 0}
+
+next:
+; CHECK-LABEL: next:
+; CHECK: br i1 %var29, label %out_of_bounds2, label %out_of_bounds
+  %iplus1 = add nsw i32 %i, 1
+  %var30 = icmp slt i32 %iplus1, %length.i
+  br i1 %var30,
label %in_bounds, label %out_of_bounds2, !prof !{!"branch_weights", i32 1000, i32 0}
+
+in_bounds:
+  ret void
+
+out_of_bounds:
+  call void @foo(i64 0)
+  unreachable
+
+out_of_bounds2:
+  call void @foo(i64 1)
+  unreachable
+}
+
+; As written, this one can't trigger today. It would require us to duplicate
+; the %val1 load down two paths and that's not implemented yet.
+define i64 @test3(i32 %length.i, i32 %i, i64* %base) {
+; CHECK-LABEL: @test3
+  %var29 = icmp slt i32 %i, %length.i
+; CHECK: br i1 %var29, label %next, label %out_of_bounds
+  br i1 %var29, label %next, label %out_of_bounds, !prof !{!"branch_weights", i32 1000, i32 0}
+
+next:
+; CHECK-LABEL: next:
+  %addr1 = getelementptr i64, i64* %base, i32 %i
+  %val1 = load i64, i64* %addr1
+  %iplus1 = add nsw i32 %i, 1
+  %var30 = icmp slt i32 %iplus1, %length.i
+; CHECK: br i1 %var30, label %in_bounds, label %out_of_bounds2
+  br i1 %var30, label %in_bounds, label %out_of_bounds2, !prof !{!"branch_weights", i32 1000, i32 0}
+
+in_bounds:
+  %addr2 = getelementptr i64, i64* %base, i32 %iplus1
+  %val2 = load i64, i64* %addr2
+  %res = sub i64 %val1, %val2
+  ret i64 %res
+
+out_of_bounds:
+  call void @foo(i64 0)
+  unreachable
+
+out_of_bounds2:
+  call void @foo(i64 %val1)
+  unreachable
+}
+
+declare void @foo(i64)