Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -73,6 +73,17 @@ "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), cl::desc("Hoist conditional stores if an unconditional store precedes")); +static cl::opt MergeCondStores( + "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true), + cl::desc("Hoist conditional stores even if an unconditional store does not " + "precede - hoist multiple conditional stores into a single " + "predicated store")); + +static cl::opt MergeCondStoresAggressively( + "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false), + cl::desc("When merging conditional stores, do so even if the resultant " + "basic blocks are unlikely to be if-converted as a result")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -2332,6 +2343,268 @@ return false; } +// If there is only one Store to Address in BB1 and BB2, return it, otherwise +// return nullptr. +static StoreInst *findUniqueStoreInBlocks(Value *Address, BasicBlock *BB1, + BasicBlock *BB2) { + StoreInst *S = nullptr; + for (User *U : Address->users()) + if (auto *SI = dyn_cast(U)) + if (SI->getParent() == BB1 || SI->getParent() == BB2) { + if (S) + // Multiple stores seen. + return nullptr; + else + S = SI; + } + return S; +} + +static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, + Value *AlternativeV = nullptr) { + // PHI is going to be a PHI node that allows the value V that is defined in + // BB to be referenced in BB's only successor. + // + // If AlternativeV is nullptr, the only value we care about in PHI is V. It + // doesn't matter to us what the other operand is (it'll never get used). We + // could just create a new PHI with an undef incoming value, but that could + // increase register pressure if EarlyCSE/InstCombine can't fold it with some + // other PHI. So here we directly look for some PHI in BB's successor with V + // as an incoming operand. If we find one, we use it, else we create a new + // one. + PHINode *PHI = nullptr; + BasicBlock *Succ = BB->getSingleSuccessor(); + + for (auto I = Succ->begin(); isa(I); ++I) + if (cast(I)->getIncomingValueForBlock(BB) == V) + PHI = cast(I); + + if (AlternativeV && PHI) + for (BasicBlock *PredBB : predecessors(Succ)) + if (PredBB != BB && + PHI->getIncomingValueForBlock(PredBB) != AlternativeV) { + PHI = nullptr; + break; + } + if (PHI) + return PHI; + + PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", Succ->begin()); + PHI->addIncoming(V, BB); + for (BasicBlock *PredBB : predecessors(Succ)) + if (PredBB != BB) + PHI->addIncoming(AlternativeV ? AlternativeV : UndefValue::get(V->getType()), + PredBB); + return PHI; +} + +static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, + BasicBlock *QTB, BasicBlock *QFB, + BasicBlock *PostBB, Value *Address, + bool SwapPCond, bool SwapQCond) { + // For every pointer, there must be exactly two stores, one coming from + // PTB or PFB, and the other from QTB or QFB. We don't support more than one + // store (to any address) in PTB,PFB or QTB,QFB. + // FIXME: We could relax this restriction with a bit more work and performance + // testing. + StoreInst *PStore = findUniqueStoreInBlocks(Address, PTB, PFB); + StoreInst *QStore = findUniqueStoreInBlocks(Address, QTB, QFB); + if (!PStore || !QStore) + return false; + + // Now check the stores are compatible. + if (QStore->getType() != PStore->getType() || + !QStore->isUnordered() || !PStore->isUnordered()) + return false; + + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [](BasicBlock *BB) { + // Heuristic: if the block can be if-converted and the instructions inside + // are all cheap (arithmetic/GEPs), it's worthwhile to thread this store. + if (BB->size() > PHINodeFoldingThreshold) + return false; + for (auto &I : *BB) + if (!isa(I) && !isa(I) && + !isa(I) && !isa(I)) + return false; + return true; + }; + + if (!MergeCondStoresAggressively && (!IsWorthwhile(PStore->getParent()) || + !IsWorthwhile(QStore->getParent()))) + return false; + + // Check that sinking the store won't cause program behavior changes. Sinking + // the store out of the Q blocks won't change any behavior as we're sinking + // from a block to its unconditional successor. But we're moving a store from + // the P blocks down through the middle block (QBI) and past both QFB and QTB. + // So we need to check that there are no aliasing loads or stores in + // QBI, QTB and QFB. We also need to check there are no conflicting memory + // operations between PStore and the end of its parent block. + // + // The ideal way to do this is to query AliasAnalysis, but we don't + // preserve AA currently so that is dangerous. Be super safe and just + // check there are no other memory operations at all. + auto IsDangerous = [](const Instruction &I) { + return isa(I) || isa(I) || isa(I) || + isa(I); + }; + for (auto &I : *QFB->getSinglePredecessor()) + if (IsDangerous(I)) + return false; + for (auto &I : *QFB) + if (&I != QStore && IsDangerous(I)) + return false; + if (QTB) + for (auto &I : *QTB) + if (&I != QStore && IsDangerous(I)) + return false; + for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); + I != E; ++I) + if (&*I != PStore && IsDangerous(*I)) + return false; + + // OK, we're going to sink the stores to PostBB. The store has to be + // conditional though, so first create the predicate. + Value *PCond = cast(PFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + Value *QCond = cast(QFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + + // At this point, it's useful to redefine TB to be the common predecessor + // if it is currently nullptr. + if (!QTB) + QTB = QFB->getSinglePredecessor(); + if (!PTB) + PTB = PFB->getSinglePredecessor(); + + Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), + PStore->getParent()); + Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), + QStore->getParent(), PPHI); + + IRBuilder<> PB(QFB->getSinglePredecessor()->getFirstInsertionPt()); + IRBuilder<> QB(PostBB->getFirstInsertionPt()); + + Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); + Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + + if (SwapPCond) + PPred = QB.CreateNot(PPred); + if (SwapQCond) + QPred = QB.CreateNot(QPred); + Value *CombinedPred = QB.CreateOr(PPred, QPred); + + auto *T = SplitBlockAndInsertIfThen(CombinedPred, QB.GetInsertPoint(), false); + QB.SetInsertPoint(T); + QB.CreateStore(QPHI, Address); + + QStore->eraseFromParent(); + PStore->eraseFromParent(); + + return true; +} + +static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { + // The intention here is to find diamonds or triangles (see below) where each + // conditional block contains a store to the same address. Both of these + // stores are conditional, so they can't be unconditionally sunk. But it may + // be profitable to speculatively sink the stores into one merged store at the + // end, and predicate the merged store on the union of the two conditions of + // PBI and QBI. + // + // This can reduce the number of stores executed if both of the conditions are + // true, and can allow the blocks to become small enough to be if-converted. + // This optimization will also chain, so that ladders of test-and-set + // sequences can be if-converted away. + // + // We only deal with simple diamonds or triangles: + // + // PBI or PBI or a combination of the two + // / \ | \ + // PTB PFB | PFB + // \ / | / + // QBI QBI + // / \ | \ + // QTB QFB | QFB + // \ / | / + // PostBB PostBB + // + // We model triangles as a type of diamond with a nullptr "true" block. + // Triangles are canonicalized so that the fallthrough edge is represented by + // a true condition, as in the diagram above. + // + BasicBlock *PTB = PBI->getSuccessor(0); + BasicBlock *PFB = PBI->getSuccessor(1); + BasicBlock *QTB = QBI->getSuccessor(0); + BasicBlock *QFB = QBI->getSuccessor(1); + BasicBlock *PostBB = QFB->getSingleSuccessor(); + + bool SwapPCond = false, SwapQCond = false; + // Canonicalize fallthroughs to the true branches. + if (PFB == QBI->getParent()) { + std::swap(PFB, PTB); + SwapPCond = true; + } + if (QFB == PostBB) { + std::swap(QFB, QTB); + SwapQCond = true; + } + + // From this point on we can assume PTB or QTB may be fallthroughs but PFB + // and QFB may not. Model fallthroughs as a nullptr block. + if (PTB == QBI->getParent()) + PTB = nullptr; + if (QTB == PostBB) + QTB = nullptr; + + // Legality bailouts. We must have at least the non-fallthrough blocks and + // the post-donimating block, and the non-fallthroughs must only have one + // predecessor. + auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { + return BB && BB->getSinglePredecessor() == P && + BB->getSingleSuccessor() == S; + }; + if (!PostBB || + !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || + !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB)) + return false; + if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) || + (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB))) + return false; + if (PostBB->getNumUses() != 2 || QBI->getParent()->getNumUses() != 2) + return false; + + // OK, this is a sequence of two diamonds or triangles. + // Check if there are stores in PTB or PFB that are repeated in QTB or QFB. + SmallPtrSet PStoreAddresses, QStoreAddresses; + if (PTB) + for (auto &I : *PTB) + if (StoreInst *SI = dyn_cast(&I)) + PStoreAddresses.insert(SI->getPointerOperand()); + for (auto &I : *PFB) + if (StoreInst *SI = dyn_cast(&I)) + PStoreAddresses.insert(SI->getPointerOperand()); + if (QTB) + for (auto &I : *QTB) + if (StoreInst *SI = dyn_cast(&I)) + QStoreAddresses.insert(SI->getPointerOperand()); + for (auto &I : *QFB) + if (StoreInst *SI = dyn_cast(&I)) + QStoreAddresses.insert(SI->getPointerOperand()); + + SmallVector CommonAddresses; + std::set_intersection(PStoreAddresses.begin(), PStoreAddresses.end(), + QStoreAddresses.begin(), QStoreAddresses.end(), + std::back_inserter(CommonAddresses)); + bool Changed = false; + for (auto *Address : CommonAddresses) + Changed |= mergeConditionalStoreToAddress( + PTB, PFB, QTB, QFB, PostBB, Address, SwapPCond, SwapQCond); + return Changed; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the @@ -2385,6 +2658,11 @@ } } + // If both branches are conditional and both contain stores to the same + // address, remove the stores from the conditionals and create a conditional merged store at the end. + if (MergeCondStores && mergeConditionalStores(PBI, BI)) + return true; + // If this is a conditional branch in an empty block, and if any // predecessors are a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. @@ -2963,7 +3241,7 @@ IE = UnwindDest->getFirstNonPHI()->getIterator(); I != IE; ++I) { PHINode *DestPN = cast(I); - + int Idx = DestPN->getBasicBlockIndex(BB); // Since BB unwinds to UnwindDest, it has to be in the PHI node. assert(Idx != -1); @@ -2988,7 +3266,7 @@ // If the incoming value was a PHI node in the cleanup pad we are // removing, we need to merge that PHI node's incoming values into // DestPN. - for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); + for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); SrcIdx != SrcE; ++SrcIdx) { DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx), SrcPN->getIncomingBlock(SrcIdx)); @@ -3341,17 +3619,17 @@ } } - // If we can prove that the cases must cover all possible values, the - // default destination becomes dead and we can remove it. If we know some + // If we can prove that the cases must cover all possible values, the + // default destination becomes dead and we can remove it. If we know some // of the bits in the value, we can use that to more precisely compute the // number of possible unique case values. bool HasDefault = !isa(SI->getDefaultDest()->getFirstNonPHIOrDbg()); - const unsigned NumUnknownBits = Bits - + const unsigned NumUnknownBits = Bits - (KnownZero.Or(KnownOne)).countPopulation(); assert(NumUnknownBits <= Bits); if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */ && + NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(), @@ -4107,7 +4385,7 @@ assert((CaseConst == TrueConst || CaseConst == FalseConst) && "Expect true or false as compare result."); } - + // Check if the branch instruction dominates the phi node. It's a simple // dominance check, but sufficient for our needs. // Although this check is invariant in the calling loops, it's better to do it @@ -4572,6 +4850,16 @@ return false; } +static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { + BasicBlock *PredPred = nullptr; + for (auto *P : predecessors(BB)) { + BasicBlock *PPred = P->getSinglePredecessor(); + if (!PPred || (PredPred && PredPred != PPred)) + return nullptr; + PredPred = PPred; + } + return PredPred; +} bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -4655,6 +4943,14 @@ if (SimplifyCondBranchToCondBranch(PBI, BI)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + // Look for diamond patterns. + if (MergeCondStores) + if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) + if (BranchInst *PBI = dyn_cast(PrevBB->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (mergeConditionalStores(PBI, BI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + return false; } @@ -4802,5 +5098,6 @@ bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, unsigned BonusInstThreshold, AssumptionCache *AC) { return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(), - BonusInstThreshold, AC).run(BB); + BonusInstThreshold, AC) + .run(BB); } Index: test/Transforms/SimplifyCFG/merge-cond-stores.ll =================================================================== --- /dev/null +++ test/Transforms/SimplifyCFG/merge-cond-stores.ll @@ -0,0 +1,279 @@ +; RUN: opt -simplifycfg -instcombine < %s -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 -S | FileCheck %s + +; CHECK-LABEL: @test_simple +; This test should succeed and end up if-converted. +; CHECK: icmp eq i32 %b, 0 +; CHECK-NEXT: icmp ne i32 %a, 0 +; CHECK-NEXT: xor i1 %x2, true +; CHECK-NEXT: %[[x:.*]] = or i1 %{{.*}}, %{{.*}} +; CHECK-NEXT: br i1 %[[x]] +; CHECK: store +; CHECK-NOT: store +; CHECK: ret +define void @test_simple(i32* %p, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %fallthrough, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +fallthrough: + %x2 = icmp eq i32 %b, 0 + br i1 %x2, label %end, label %yes2 + +yes2: + store i32 1, i32* %p + br label %end + +end: + ret void +} + +; CHECK-LABEL: @test_recursive +; This test should entirely fold away, leaving one large basic block. +; CHECK: store +; CHECK-NOT: store +; CHECK: ret +define void @test_recursive(i32* %p, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %fallthrough, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +fallthrough: + %x2 = icmp eq i32 %b, 0 + br i1 %x2, label %next, label %yes2 + +yes2: + store i32 1, i32* %p + br label %next + +next: + %x3 = icmp eq i32 %c, 0 + br i1 %x3, label %fallthrough2, label %yes3 + +yes3: + store i32 2, i32* %p + br label %fallthrough2 + +fallthrough2: + %x4 = icmp eq i32 %d, 0 + br i1 %x4, label %end, label %yes4 + +yes4: + store i32 3, i32* %p + br label %end + + +end: + ret void +} + +; CHECK-LABEL: @test_not_ifconverted +; The code in each diamond is too large - it won't be if-converted so our +; heuristics should say no. +; CHECK: store +; CHECK: store +; CHECK: ret +define void @test_not_ifconverted(i32* %p, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %fallthrough, label %yes1 + +yes1: + %y1 = or i32 %b, 55 + %y2 = add i32 %y1, 24 + %y3 = and i32 %y2, 67 + store i32 %y3, i32* %p + br label %fallthrough + +fallthrough: + %x2 = icmp eq i32 %b, 0 + br i1 %x2, label %end, label %yes2 + +yes2: + %z1 = or i32 %a, 55 + %z2 = add i32 %z1, 24 + %z3 = and i32 %z2, 67 + store i32 %z3, i32* %p + br label %end + +end: + ret void +} + +; CHECK-LABEL: @test_aliasing1 +; The store to %p clobbers the previous store, so if-converting this would +; be illegal. +; CHECK: store +; CHECK: store +; CHECK: ret +define void @test_aliasing1(i32* %p, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %fallthrough, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +fallthrough: + %y1 = load i32, i32* %p + %x2 = icmp eq i32 %y1, 0 + br i1 %x2, label %end, label %yes2 + +yes2: + store i32 1, i32* %p + br label %end + +end: + ret void +} + +; CHECK-LABEL: @test_aliasing2 +; The load from %q aliases with %p, so if-converting this would be illegal. +; CHECK: store +; CHECK: store +; CHECK: ret +define void @test_aliasing2(i32* %p, i32* %q, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %fallthrough, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +fallthrough: + %y1 = load i32, i32* %q + %x2 = icmp eq i32 %y1, 0 + br i1 %x2, label %end, label %yes2 + +yes2: + store i32 1, i32* %p + br label %end + +end: + ret void +} + +declare void @f() + +; CHECK-LABEL: @test_diamond_simple +; This should get if-converted. +; CHECK: store +; CHECK-NOT: store +; CHECK: ret +define i32 @test_diamond_simple(i32* %p, i32* %q, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %no1, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +no1: + %z1 = add i32 %a, %b + br label %fallthrough + +fallthrough: + %z2 = phi i32 [ %z1, %no1 ], [ 0, %yes1 ] + %x2 = icmp eq i32 %b, 0 + br i1 %x2, label %no2, label %yes2 + +yes2: + store i32 1, i32* %p + br label %end + +no2: + %z3 = sub i32 %z2, %b + br label %end + +end: + %z4 = phi i32 [ %z3, %no2 ], [ 3, %yes2 ] + ret i32 %z4 +} + +; CHECK-LABEL: @test_diamond_simple2 +; This should get if-converted. The call to f() occurs in the first branch, +; so it does not affect the if-conversion. +; CHECK: store +; CHECK-NOT: store +; CHECK: ret +define i32 @test_diamond_simple2(i32* %p, i32* %q, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %no1, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +no1: + call void @f() + %z1 = add i32 %a, %b + br label %fallthrough + +fallthrough: + %z2 = phi i32 [ %z1, %no1 ], [ 0, %yes1 ] + %x2 = icmp eq i32 %b, 0 + br i1 %x2, label %no2, label %yes2 + +yes2: + store i32 1, i32* %p + br label %end + +no2: + %z3 = sub i32 %z2, %b + br label %end + +end: + %z4 = phi i32 [ %z3, %no2 ], [ 3, %yes2 ] + ret i32 %z4 +} + +; CHECK-LABEL: @test_diamond_alias3 +; Now there is a call to f() in the bottom branch. The store in the first +; branch would now be reordered with respect to the call if we if-converted, +; so we must not. +; CHECK: store +; CHECK: store +; CHECK: ret +define i32 @test_diamond_alias3(i32* %p, i32* %q, i32 %a, i32 %b) { +entry: + %x1 = icmp eq i32 %a, 0 + br i1 %x1, label %no1, label %yes1 + +yes1: + store i32 0, i32* %p + br label %fallthrough + +no1: + call void @f() + %z1 = add i32 %a, %b + br label %fallthrough + +fallthrough: + %z2 = phi i32 [ %z1, %no1 ], [ 0, %yes1 ] + %x2 = icmp eq i32 %b, 0 + br i1 %x2, label %no2, label %yes2 + +yes2: + store i32 1, i32* %p + br label %end + +no2: + call void @f() + %z3 = sub i32 %z2, %b + br label %end + +end: + %z4 = phi i32 [ %z3, %no2 ], [ 3, %yes2 ] + ret i32 %z4 +}