Index: include/llvm/Transforms/Scalar/JumpThreading.h
===================================================================
--- include/llvm/Transforms/Scalar/JumpThreading.h
+++ include/llvm/Transforms/Scalar/JumpThreading.h
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueHandle.h"

 namespace llvm {
@@ -62,6 +63,7 @@
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   bool HasProfileData = false;
+  bool HasGuards = false;
 #ifdef NDEBUG
   SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
 #else
@@ -122,6 +124,9 @@
   bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
   bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);

+  bool ProcessGuards(BasicBlock *BB);
+  bool ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, BranchInst *BI);
+
 private:
   BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
                               const char *Suffix);
Index: include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- include/llvm/Transforms/Utils/Cloning.h
+++ include/llvm/Transforms/Utils/Cloning.h
@@ -245,6 +245,16 @@
 void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
                                ValueToValueMapTy &VMap);

+/// Split the edge between BB and PredBB and duplicate all non-Phi
+/// instructions from BB between its beginning and the StopAt instruction
+/// into the split block. Phi nodes are not duplicated; uses of them in the
+/// cloned instructions are replaced with the values incoming from PredBB.
+/// ValueMapping is used to map the original instructions from BB to their
+/// newly-created copies. Returns the split block.
+BasicBlock *
+DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+                                    Instruction *StopAt,
+                                    ValueToValueMapTy &ValueMapping);
 } // end namespace llvm

 #endif // LLVM_TRANSFORMS_UTILS_CLONING_H
Index: lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- lib/Transforms/Scalar/JumpThreading.cpp
+++ lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,11 +30,13 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
@@ -169,6 +171,9 @@
   // When profile data is available, we need to update edge weights after
   // successful jump threading, which requires both BPI and BFI being available.
   HasProfileData = HasProfileData_;
+  auto *GuardDecl = F.getParent()->getFunction(
+      Intrinsic::getName(Intrinsic::experimental_guard));
+  HasGuards = GuardDecl && !GuardDecl->use_empty();
   if (HasProfileData) {
     BPI = std::move(BPI_);
     BFI = std::move(BFI_);
@@ -238,10 +243,13 @@
   return EverChanged;
 }

-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+/// Return the cost of duplicating a piece of this block, from its first
+/// non-phi up to the StopAt instruction, to thread across it. Stop scanning
+/// the block when exceeding the threshold; if duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+                                             Instruction *StopAt,
                                              unsigned Threshold) {
+  assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
   /// Ignore PHI nodes, these will be flattened when duplication happens.
   BasicBlock::const_iterator I(BB->getFirstNonPHI());
@@ -249,15 +257,17 @@
   // branch, so they shouldn't count against the duplication cost.

   unsigned Bonus = 0;
-  const TerminatorInst *BBTerm = BB->getTerminator();
-  // Threading through a switch statement is particularly profitable. If this
-  // block ends in a switch, decrease its cost to make it more likely to happen.
-  if (isa<SwitchInst>(BBTerm))
-    Bonus = 6;
-
-  // The same holds for indirect branches, but slightly more so.
-  if (isa<IndirectBrInst>(BBTerm))
-    Bonus = 8;
+  if (BB->getTerminator() == StopAt) {
+    // Threading through a switch statement is particularly profitable. If this
+    // block ends in a switch, decrease its cost to make it more likely to
+    // happen.
+    if (isa<SwitchInst>(StopAt))
+      Bonus = 6;
+
+    // The same holds for indirect branches, but slightly more so.
+    if (isa<IndirectBrInst>(StopAt))
+      Bonus = 8;
+  }

   // Bump the threshold up so the early exit from the loop doesn't skip the
   // terminator-based Size adjustment at the end.
@@ -266,7 +276,7 @@
   // Sum up the cost of each instruction until we get to the terminator. Don't
   // include the terminator because the copy won't include it.
   unsigned Size = 0;
-  for (; !isa<TerminatorInst>(I); ++I) {
+  for (; &*I != StopAt; ++I) {

     // Stop scanning the block if we've reached the threshold.
     if (Size > Threshold)
@@ -712,6 +722,10 @@
   if (TryToUnfoldSelectInCurrBB(BB))
     return true;

+  // See if we can propagate guards to predecessors.
+  if (HasGuards && ProcessGuards(BB))
+    return true;
+
   // What kind of constant we're looking for.
   ConstantPreference Preference = WantInteger;
@@ -1466,7 +1480,8 @@
     return false;
   }

-  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  unsigned JumpThreadCost =
+      getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
   if (JumpThreadCost > BBDupThreshold) {
     DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
                  << "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1754,7 +1769,8 @@
     return false;
   }

-  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  unsigned DuplicationCost =
+      getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
   if (DuplicationCost > BBDupThreshold) {
     DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
                  << "' - Cost is too high: " << DuplicationCost << "\n");
@@ -2019,3 +2035,130 @@

   return false;
 }
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// when another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+///   %cond = ...
+///   br i1 %cond, label %T1, label %F1
+/// T1:
+///   br label %Merge
+/// F1:
+///   br label %Merge
+/// Merge:
+///   %condGuard = ...
+///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// Here either %cond or its negation must imply %condGuard. In this case all
+/// the instructions before the guard can be duplicated in both branches, and
+/// the guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+  using namespace PatternMatch;
+  // We only want to deal with two predecessors.
+  BasicBlock *Pred1, *Pred2;
+  auto PI = pred_begin(BB), PE = pred_end(BB);
+  if (PI == PE)
+    return false;
+  Pred1 = *PI++;
+  if (PI == PE)
+    return false;
+  Pred2 = *PI++;
+  if (PI != PE)
+    return false;
+  if (Pred1 == Pred2)
+    return false;
+
+  // Try to thread one of the guards of the block.
+  // TODO: Look deeper than the immediate predecessor?
+  auto *Parent = Pred1->getSinglePredecessor();
+  if (!Parent || Parent != Pred2->getSinglePredecessor())
+    return false;
+
+  if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+    for (auto &I : *BB)
+      if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+        if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+          return true;
+
+  return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition (or its negation)
+/// implies the guard's condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+                                    BranchInst *BI) {
+  assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+  assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+  Value *GuardCond = Guard->getArgOperand(0);
+  Value *BranchCond = BI->getCondition();
+  BasicBlock *TrueDest = BI->getSuccessor(0);
+  BasicBlock *FalseDest = BI->getSuccessor(1);
+
+  auto &DL = BB->getModule()->getDataLayout();
+  bool TrueDestIsSafe = false;
+  bool FalseDestIsSafe = false;
+
+  // True dest is safe if BranchCond => GuardCond.
+  auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+  if (Impl && *Impl)
+    TrueDestIsSafe = true;
+  else {
+    // False dest is safe if !BranchCond => GuardCond.
+    Impl =
+        isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+    if (Impl && *Impl)
+      FalseDestIsSafe = true;
+  }
+
+  if (!TrueDestIsSafe && !FalseDestIsSafe)
+    return false;
+
+  BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+  BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+  ValueToValueMapTy UnguardedMapping, GuardedMapping;
+  Instruction *AfterGuard = Guard->getNextNode();
+  unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+  if (Cost > BBDupThreshold)
+    return false;
+  // Duplicate all instructions before the guard and the guard itself to the
+  // branch where the implication is not proven.
+  GuardedBlock = DuplicateInstructionsInSplitBetween(
+      BB, GuardedBlock, AfterGuard, GuardedMapping);
+  assert(GuardedBlock && "Could not create the guarded block?");
+  // Duplicate all instructions before the guard in the unguarded branch.
+  // Since we have successfully duplicated the guarded block and this copy
+  // has fewer instructions, we expect it to succeed.
+  UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+                                                       Guard, UnguardedMapping);
+  assert(UnguardedBlock && "Could not create the unguarded block?");
+  DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
               << GuardedBlock->getName() << "\n");
+
+  // Some instructions before the guard may still have uses. For them, we need
+  // to create Phi nodes merging their copies in both guarded and unguarded
+  // branches. Those instructions that have no uses can simply be removed.
+  SmallVector<Instruction *, 4> ToRemove;
+  for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+    if (!isa<PHINode>(&*BI))
+      ToRemove.push_back(&*BI);
+
+  Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+  assert(InsertionPoint && "Empty block?");
+  // Substitute with Phis & remove.
+  for (auto *Inst : reverse(ToRemove)) {
+    if (!Inst->use_empty()) {
+      PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+      NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+      NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+      NewPN->insertBefore(InsertionPoint);
+      Inst->replaceAllUsesWith(NewPN);
+    }
+    Inst->eraseFromParent();
+  }
+  return true;
+}
Index: lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- lib/Transforms/Utils/CloneFunction.cpp
+++ lib/Transforms/Utils/CloneFunction.cpp
@@ -747,3 +747,40 @@

   return NewLoop;
 }
+
+/// \brief Duplicate non-Phi instructions from the beginning of the block up
+/// to the StopAt instruction into a split block between BB and its predecessor.
+BasicBlock *
+llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+                                          Instruction *StopAt,
+                                          ValueToValueMapTy &ValueMapping) {
+  // We are going to have to map operands from the original block BB to the new
+  // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  BasicBlock *NewBB = SplitEdge(PredBB, BB);
+  NewBB->setName(PredBB->getName() + ".split");
+  Instruction *NewTerm = NewBB->getTerminator();
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; StopAt != &*BI; ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    New->insertBefore(NewTerm);
+    ValueMapping[&*BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        auto I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  return NewBB;
+}
Index: test/Transforms/JumpThreading/guards.ll
===================================================================
--- test/Transforms/JumpThreading/guards.ll
+++ test/Transforms/JumpThreading/guards.ll
@@ -0,0 +1,183 @@
+; RUN: opt < %s -jump-threading -dce -S | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @f1()
+declare i32 @f2()
+
+define i32 @branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @branch_implies_guard(
+  %cond = icmp slt i32 %a, 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1.split
+; CHECK: %v1 = call i32 @f1()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1.split
+; CHECK: %v2 = call i32 @f2()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: %condGuard
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp slt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @not_branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @not_branch_implies_guard(
+  %cond = icmp slt i32 %a, 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1.split:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: %condGuard
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1.split:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp sgt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_overlaps_guard(i32 %a) {
+; CHECK-LABEL: @branch_overlaps_guard(
+  %cond = icmp slt i32 %a, 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK: %condGuard = icmp slt i32 %a, 10
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp slt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_doesnt_overlap_guard(i32 %a) {
+; CHECK-LABEL: @branch_doesnt_overlap_guard(
+  %cond = icmp slt i32 %a, 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK: %condGuard = icmp sgt i32 %a, 20
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp sgt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @not_a_diamond1(i32 %a, i1 %cond1) {
+; CHECK-LABEL: @not_a_diamond1(
+  br i1 %cond1, label %Pred, label %Exit
+
+Pred:
+; CHECK: Pred:
+; CHECK-NEXT: switch i32 %a, label %Exit
+  switch i32 %a, label %Exit [
+    i32 10, label %Merge
+    i32 20, label %Merge
+  ]
+
+Merge:
+; CHECK: Merge:
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+; CHECK-NEXT: br label %Exit
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+  br label %Exit
+
+Exit:
+; CHECK: Exit:
+; CHECK-NEXT: ret i32 %a
+  ret i32 %a
+}
+
+define void @not_a_diamond2(i32 %a, i1 %cond1) {
+; CHECK-LABEL: @not_a_diamond2(
+  br label %Parent
+
+Merge:
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond1)[ "deopt"() ]
+  ret void
+
+Pred:
+; CHECK-NEXT: Pred:
+; CHECK-NEXT: switch i32 %a, label %Exit
+  switch i32 %a, label %Exit [
+    i32 10, label %Merge
+    i32 20, label %Merge
+  ]
+
+Parent:
+  br label %Pred
+
+Exit:
+; CHECK: Merge:
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+; CHECK-NEXT: ret void
+  ret void
+}
Index: unittests/Transforms/Utils/Cloning.cpp
===================================================================
--- unittests/Transforms/Utils/Cloning.cpp
+++ unittests/Transforms/Utils/Cloning.cpp
@@ -201,6 +201,53 @@
   delete F2;
 }

+TEST_F(CloneInstruction, DuplicateInstructionsToSplit) {
+  Type *ArgTy1[] = {Type::getInt32PtrTy(context)};
+  FunctionType *FT = FunctionType::get(Type::getVoidTy(context), ArgTy1, false);
+  V = new Argument(Type::getInt32Ty(context));
+
+  Function *F = Function::Create(FT, Function::ExternalLinkage);
+
+  BasicBlock *BB1 = BasicBlock::Create(context, "", F);
+  IRBuilder<> Builder1(BB1);
+
+  BasicBlock *BB2 = BasicBlock::Create(context, "", F);
+  IRBuilder<> Builder2(BB2);
+
+  Builder1.CreateBr(BB2);
+
+  Instruction *AddInst = cast<Instruction>(Builder2.CreateAdd(V, V));
+  Instruction *MulInst = cast<Instruction>(Builder2.CreateMul(AddInst, V));
+  Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
+  Builder2.CreateRetVoid();
+
+  ValueToValueMapTy Mapping;
+
+  auto Split = DuplicateInstructionsInSplitBetween(BB2, BB1, SubInst, Mapping);
+
+  EXPECT_TRUE(Split);
+  EXPECT_EQ(Mapping.size(), 2u);
+  EXPECT_TRUE(Mapping.find(AddInst) != Mapping.end());
+  EXPECT_TRUE(Mapping.find(MulInst) != Mapping.end());
+
+  auto AddSplit = dyn_cast<Instruction>(Mapping[AddInst]);
+  EXPECT_TRUE(AddSplit);
+  EXPECT_EQ(AddSplit->getOperand(0), V);
+  EXPECT_EQ(AddSplit->getOperand(1), V);
+  EXPECT_EQ(AddSplit->getParent(), Split);
+
+  auto MulSplit = dyn_cast<Instruction>(Mapping[MulInst]);
+  EXPECT_TRUE(MulSplit);
+  EXPECT_EQ(MulSplit->getOperand(0), AddSplit);
+  EXPECT_EQ(MulSplit->getOperand(1), V);
+  EXPECT_EQ(MulSplit->getParent(), Split);
+
+  EXPECT_EQ(AddSplit->getNextNode(), MulSplit);
+  EXPECT_EQ(MulSplit->getNextNode(), Split->getTerminator());
+
+  delete F;
+}
+
 class CloneFunc : public ::testing::Test {
 protected:
   void SetUp() override {
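For orientation, here is a minimal hand-written sketch of the IR shape this transform aims to produce for the diamond documented above ProcessGuards, assuming the branch condition (%a slt 10) implies the guard condition (%a slt 20), as in the @branch_implies_guard test. The function name, value names, and comments are illustrative only; the ".split" block-name suffix is the one DuplicateInstructionsInSplitBetween actually sets, and the exact output may differ in naming and in which dead instructions remain before -dce runs.

declare void @llvm.experimental.guard(i1, ...)

define void @guard_threading_sketch(i32 %a) {
Start:
  %cond = icmp slt i32 %a, 10
  br i1 %cond, label %T1, label %F1

T1:
  br label %T1.split

T1.split:
  ; unguarded copy: %cond implies the guard condition, so no guard is emitted
  %condGuard.unguarded = icmp slt i32 %a, 20
  br label %Merge

F1:
  br label %F1.split

F1.split:
  ; guarded copy: the implication is not proven on this path, so the guard stays
  %condGuard.guarded = icmp slt i32 %a, 20
  call void (i1, ...) @llvm.experimental.guard(i1 %condGuard.guarded) [ "deopt"() ]
  br label %Merge

Merge:
  ; the original guard is gone; any pre-guard value that still had uses would
  ; be rebuilt here as a phi of its two copies
  ret void
}

The guard is kept only on the edge where the implication could not be established; the duplicated compare on the unguarded edge is simply left dead, which is why the test's RUN line pipes the result through -dce.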