Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1319,78 +1319,14 @@
   return true;
 }
 
-// Return true if V0 and V1 are equivalent. This handles the obvious cases
-// where V0 == V1 and V0 and V1 are both identical instructions, but also
-// handles loads and stores with identical operands.
-//
-// Because determining if two memory instructions are equivalent
-// depends on control flow, the \c At0 and \c At1 parameters specify a
-// location for the query. This function is essentially answering the
-// query "If V0 were moved to At0, and V1 were moved to At1, are V0 and V1
-// equivalent?". In practice this means checking that moving V0 to At0
-// doesn't cross any other memory instructions.
-static bool areValuesTriviallySame(Value *V0, BasicBlock::const_iterator At0,
-                                   Value *V1, BasicBlock::const_iterator At1) {
-  if (V0 == V1)
-    return true;
-
-  // Also check for instructions that are identical but not pointer-identical.
-  // This can include load instructions that haven't been CSE'd.
-  if (!isa<Instruction>(V0) || !isa<Instruction>(V1))
-    return false;
-  const auto *I0 = cast<Instruction>(V0);
-  const auto *I1 = cast<Instruction>(V1);
-  if (!I0->isIdenticalToWhenDefined(I1))
-    return false;
-
-  if (!I0->mayReadOrWriteMemory())
-    return true;
-
-  // Instructions that may read or write memory have extra restrictions. We
-  // must ensure we don't treat %a and %b as equivalent in code such as:
-  //
-  //   %a = load %x
-  //   store %x, 1
-  //   if (%c) {
-  //     %b = load %x
-  //     %d = add %b, 1
-  //   } else {
-  //     %d = add %a, 1
-  //   }
-
-  // Be conservative. We don't want to search the entire CFG between def
-  // and use; if the def isn't in the same block as the use just bail.
-  if (I0->getParent() != At0->getParent() ||
-      I1->getParent() != At1->getParent())
-    return false;
-
-  // Again, be super conservative. Ideally we'd be able to query AliasAnalysis
-  // but we currently don't have that available.
-  auto WritesMemory = [](const Instruction &I) {
-    return I.mayReadOrWriteMemory();
-  };
-  if (std::any_of(std::next(I0->getIterator()), At0, WritesMemory))
-    return false;
-  if (std::any_of(std::next(I1->getIterator()), At1, WritesMemory))
-    return false;
-  return true;
-}
-
-// All blocks in Blocks unconditionally jump to a common successor. Analyze
-// the last non-terminator instruction in each block and return true if it
-// would be possible to sink them into their successor, creating one common
-// instruction instead. Set NumPHIsRequired to the number of PHI nodes that
-// would need to be created during sinking.
-static bool canSinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
-                                   unsigned &NumPHIsRequired) {
-  SmallVector<const Instruction *, 4> Insts;
-  for (auto *BB : Blocks) {
-    if (BB->getTerminator() == &BB->front())
-      // Block was empty.
-      return false;
-    Insts.push_back(BB->getTerminator()->getPrevNode());
-  }
-
+// All instructions in Insts belong to different blocks that all
+// unconditionally branch to a common successor. Analyze each instruction and
+// return true if it would be possible to sink them into their successor,
+// creating one common instruction instead. For every value that would be
+// required to be provided by a PHI node (because an operand varies in each
+// input block), add it to PHIOperands.
+static bool canSinkInstructions(
+    ArrayRef<Instruction *> Insts,
+    DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
   // Prune out obviously bad instructions to move. Any non-store instruction
   // must have exactly one use, and we check later that use is by a single,
   // common PHI instruction in the successor.
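
To make the operand rule above concrete: a row of candidate instructions (one per predecessor block) must all be the same operation; operands that are pointer-identical across the row merge for free, and any operand position that differs is queued for PHI-merging. Below is a minimal standalone sketch of that bookkeeping. Inst, canSinkRow and the string-based operands are hypothetical stand-ins for illustration, not the LLVM API the patch uses:

  #include <map>
  #include <string>
  #include <vector>

  // Toy model of the operand check in canSinkInstructions: an "instruction"
  // is just an opcode plus operand names. Inst and canSinkRow are
  // hypothetical stand-ins, not LLVM types.
  struct Inst {
    std::string Opcode;
    std::vector<std::string> Ops;
  };

  // Row holds one instruction per predecessor block. Returns true if the row
  // could be merged into one instruction; any operand that differs across the
  // row is pushed into PHIOperands, mirroring the DenseMap in the patch.
  static bool canSinkRow(
      const std::vector<Inst> &Row,
      std::map<const Inst *, std::vector<std::string>> &PHIOperands) {
    const Inst &I0 = Row.front();
    for (const Inst &I : Row)
      if (I.Opcode != I0.Opcode || I.Ops.size() != I0.Ops.size())
        return false; // not the same operation; cannot common them
    for (size_t OI = 0; OI != I0.Ops.size(); ++OI) {
      bool AllSame = true;
      for (const Inst &I : Row)
        AllSame = AllSame && I.Ops[OI] == I0.Ops[OI];
      if (!AllSame)
        for (const Inst &I : Row)
          PHIOperands[&I].push_back(I.Ops[OI]); // operand OI needs a PHI
    }
    return true;
  }

  int main() {
    std::map<const Inst *, std::vector<std::string>> PHIOperands;
    std::vector<Inst> Row = {{"add", {"%x", "1"}}, {"add", {"%x", "4"}}};
    bool OK = canSinkRow(Row, PHIOperands); // true; "1" and "4" need a PHI
    return OK ? 0 : 1;
  }

For the row add %x, 1 / add %x, 4, operand 0 is shared and operand 1 differs, so "1" and "4" are recorded and sinking the row would cost exactly one PHI.
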
@@ -1410,24 +1346,26 @@
     if (!I->isSameOperationAs(I0))
       return false;
 
-  // If this isn't a store, check the only user is a single PHI.
+  // All instructions in Insts are known to be the same opcode. If they
+  // aren't stores, check the only user of each is a PHI or in the same block
+  // as the instruction, because if a user is in the same block as an
+  // instruction we're contemplating sinking, it must already have been
+  // determined to be sinkable.
   if (!isa<StoreInst>(I0)) {
     auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
-    if (!PNUse ||
-        !all_of(Insts, [&PNUse](const Instruction *I) {
-          return *I->user_begin() == PNUse;
+    if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
+          auto *U = cast<Instruction>(*I->user_begin());
+          return U == PNUse || U->getParent() == I->getParent();
         }))
       return false;
   }
 
-  NumPHIsRequired = 0;
   for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
     if (I0->getOperand(OI)->getType()->isTokenTy())
       // Don't touch any operand of token type.
       return false;
 
     auto SameAsI0 = [&I0, OI](const Instruction *I) {
-      return areValuesTriviallySame(I->getOperand(OI), I->getIterator(),
-                                    I0->getOperand(OI), I0->getIterator());
+      assert(I->getNumOperands() == I0->getNumOperands());
+      return I->getOperand(OI) == I0->getOperand(OI);
     };
     if (!all_of(Insts, SameAsI0)) {
       if (I0->mustOperandBeConstant(OI))
@@ -1438,7 +1376,8 @@
         // FIXME: if the call was *already* indirect, we should do this.
         return false;
       }
-      ++NumPHIsRequired;
+      for (auto *I : Insts)
+        PHIOperands[I].push_back(I->getOperand(OI));
     }
   }
   return true;
@@ -1448,10 +1387,6 @@
 // instruction of every block in Blocks to their common successor, commoning
 // into one instruction.
 static void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks) {
-  unsigned Dummy;
-  (void)Dummy;
-  assert(canSinkLastInstruction(Blocks, Dummy) &&
-         "Must analyze before transforming!");
   auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
 
   // canSinkLastInstruction returning true guarantees that every block has at
@@ -1511,6 +1446,60 @@
     I->eraseFromParent();
 }
 
+namespace {
+  // LockstepReverseIterator - Iterates through instructions
+  // in a set of blocks in reverse order from the first non-terminator.
+  // For example (assume all blocks have size n):
+  //   LockstepReverseIterator I([B1, B2, B3]);
+  //   *I-- = [B1[n], B2[n], B3[n]];
+  //   *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+  //   *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+  //   ...
+  class LockstepReverseIterator {
+    ArrayRef<BasicBlock *> Blocks;
+    SmallVector<Instruction *, 4> Insts;
+    bool Fail;
+  public:
+    LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) :
+      Blocks(Blocks) {
+      reset();
+    }
+
+    void reset() {
+      Fail = false;
+      Insts.clear();
+      for (auto *BB : Blocks) {
+        if (BB->size() <= 1) {
+          // Block wasn't big enough.
+          Fail = true;
+          return;
+        }
+        Insts.push_back(BB->getTerminator()->getPrevNode());
+      }
+    }
+
+    bool isValid() const {
+      return !Fail;
+    }
+
+    void operator--() {
+      if (Fail)
+        return;
+      for (auto *&Inst : Insts) {
+        if (Inst == &Inst->getParent()->front()) {
+          Fail = true;
+          return;
+        }
+        Inst = Inst->getPrevNode();
+      }
+    }
+
+    ArrayRef<Instruction *> operator*() const {
+      return Insts;
+    }
+  };
+}
+
 /// Given an unconditional branch that goes to BBEnd,
 /// check whether BBEnd has only two predecessors and the other predecessor
 /// ends with an unconditional branch. If it is true, sink any common code
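
The traversal order the class above defines can be shown with a self-contained analogue over plain vectors; everything here (main, Idx, the integer payloads) is illustrative scaffolding rather than code from the patch:

  #include <cstdio>
  #include <vector>

  // Standalone analogue of LockstepReverseIterator: visit the last element of
  // every sequence, then the second-to-last of every sequence, and so on,
  // failing as soon as any sequence runs out. The real class walks
  // llvm::Instruction lists and starts from the last non-terminator.
  int main() {
    std::vector<std::vector<int>> Blocks = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    std::vector<size_t> Idx(Blocks.size());
    for (size_t B = 0; B != Blocks.size(); ++B)
      Idx[B] = Blocks[B].size(); // one past the last element of each sequence

    for (;;) {
      bool Fail = false;
      for (size_t I : Idx)
        Fail |= (I == 0);
      if (Fail)
        break; // some block ran out of instructions
      for (size_t &I : Idx)
        --I;
      // Visit the current "row": the n'th element from the end of each block.
      for (size_t B = 0; B != Blocks.size(); ++B)
        std::printf("%d ", Blocks[B][Idx[B]]);
      std::printf("\n"); // prints "3 6 9", then "2 5 8", then "1 4 7"
    }
    return 0;
  }
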
@@ -1519,6 +1508,8 @@
   assert(BI1->isUnconditional());
   BasicBlock *BBEnd = BI1->getSuccessor(0);
 
+  // We currently only support branch targets with two predecessors.
+  // FIXME: this is an arbitrary restriction and should be lifted.
   SmallVector<BasicBlock *, 4> Blocks;
   for (auto *BB : predecessors(BBEnd))
     Blocks.push_back(BB);
@@ -1530,12 +1521,62 @@
     return false;
 
   bool Changed = false;
-  unsigned NumPHIsToInsert;
-  while (canSinkLastInstruction(Blocks, NumPHIsToInsert) && NumPHIsToInsert <= 1) {
+
+  // We take a two-step approach to tail sinking. First we scan from the end
+  // of each block upwards in lockstep. If the n'th instruction from the end
+  // of each block can be sunk, those instructions are added to
+  // InstructionsToSink and we carry on. If we can sink an instruction but
+  // need to PHI-merge some operands (because they're not identical in each
+  // instruction) we add these to PHIOperands.
+  unsigned ScanIdx = 0;
+  SmallPtrSet<Value *, 4> InstructionsToSink;
+  DenseMap<Instruction *, SmallVector<Value *, 4>> PHIOperands;
+  LockstepReverseIterator LRI(Blocks);
+  while (LRI.isValid() &&
+         canSinkInstructions(*LRI, PHIOperands)) {
+    DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] << "\n");
+    InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
+    ++ScanIdx;
+    --LRI;
+  }
+
+  // Now that we've analyzed all potential sinking candidates, perform the
+  // actual sink. We iteratively sink the last non-terminator of the source
+  // blocks into their common successor unless doing so would require too
+  // many PHI instructions to be generated (currently only one PHI is allowed
+  // per sunk instruction).
+  //
+  // We can use InstructionsToSink to discount values needing PHI-merging
+  // that will actually be sunk in a later iteration. This allows us to be
+  // more aggressive in what we sink. This does allow a false positive where
+  // we sink presuming a later value will also be sunk, but stop halfway
+  // through and never actually sink it, which means we produce more PHIs
+  // than intended. This is unlikely in practice though.
+  for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) {
+    DEBUG(dbgs() << "SINK: Sink: "
+                 << *Blocks[0]->getTerminator()->getPrevNode()
+                 << "\n");
+    // Because we've sunk every instruction in turn, the current instruction
+    // to sink is always at index 0.
+    LRI.reset();
+    unsigned NumPHIdValues = 0;
+    for (auto *I : *LRI)
+      for (auto *V : PHIOperands[I])
+        if (InstructionsToSink.count(V) == 0)
+          ++NumPHIdValues;
+    DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+    assert((NumPHIdValues % Blocks.size() == 0) &&
+           "Every operand must either be PHId or not PHId!");
+
+    if (NumPHIdValues / Blocks.size() > 1)
+      // Too many PHIs would be created.
+      break;
+
     sinkLastInstruction(Blocks);
     NumSinkCommons++;
     Changed = true;
   }
+
   return Changed;
 }
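
A small worked example of the PHI budget check in the loop above, assuming the invariant stated in the assert (each differing operand position contributes one PHIOperands entry per block, so NumPHIdValues is always a multiple of the block count and the quotient is the number of PHIs the sink would create); the concrete numbers are made up:

  #include <cstdio>

  int main() {
    unsigned NumBlocks = 2;     // two predecessors of the common successor
    unsigned NumPHIdValues = 4; // two operand positions differ in both blocks
    unsigned PHIsNeeded = NumPHIdValues / NumBlocks; // 2
    if (PHIsNeeded > 1)
      std::puts("SINK: too many PHIs would be created; stop sinking here");
    return 0;
  }
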
Index: test/Transforms/SimplifyCFG/sink-common-code.ll
===================================================================
--- test/Transforms/SimplifyCFG/sink-common-code.ll
+++ test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -333,3 +333,63 @@
 ; CHECK-LABEL: test13
 ; CHECK: %[[x:.*]] = select i1 %flag
 ; CHECK: call i32 @bar(i32 %[[x]])
+
+; The load should be commoned.
+define i32 @test14(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv1 = load i32, i32* %gepa
+  %cmp1 = icmp eq i32 %sv1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %cmp2 = icmp eq i32 %sv2, 57
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test14
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
+; The load should be commoned.
+define i32 @test15(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0
+  %sv1 = load i32, i32* %gepa
+  %ext1 = zext i32 %sv1 to i64
+  %cmp1 = icmp eq i64 %ext1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %ext2 = zext i32 %sv2 to i64
+  %cmp2 = icmp eq i64 %ext2, 57
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test15
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load