Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -1319,78 +1319,13 @@ return true; } -// Return true if V0 and V1 are equivalent. This handles the obvious cases -// where V0 == V1 and V0 and V1 are both identical instructions, but also -// handles loads and stores with identical operands. -// -// Because determining if two memory instructions are equivalent -// depends on control flow, the \c At0 and \c At1 parameters specify a -// location for the query. This function is essentially answering the -// query "If V0 were moved to At0, and V1 were moved to At1, are V0 and V1 -// equivalent?". In practice this means checking that moving V0 to At0 -// doesn't cross any other memory instructions. -static bool areValuesTriviallySame(Value *V0, BasicBlock::const_iterator At0, - Value *V1, BasicBlock::const_iterator At1) { - if (V0 == V1) - return true; - - // Also check for instructions that are identical but not pointer-identical. - // This can include load instructions that haven't been CSE'd. - if (!isa(V0) || !isa(V1)) - return false; - const auto *I0 = cast(V0); - const auto *I1 = cast(V1); - if (!I0->isIdenticalToWhenDefined(I1)) - return false; - - if (!I0->mayReadOrWriteMemory()) - return true; - - // Instructions that may read or write memory have extra restrictions. We - // must ensure we don't treat %a and %b as equivalent in code such as: - // - // %a = load %x - // store %x, 1 - // if (%c) { - // %b = load %x - // %d = add %b, 1 - // } else { - // %d = add %a, 1 - // } - - // Be conservative. We don't want to search the entire CFG between def - // and use; if the def isn't in the same block as the use just bail. - if (I0->getParent() != At0->getParent() || - I1->getParent() != At1->getParent()) - return false; - - // Again, be super conservative. Ideally we'd be able to query AliasAnalysis - // but we currently don't have that available. - auto WritesMemory = [](const Instruction &I) { - return I.mayReadOrWriteMemory(); - }; - if (std::any_of(std::next(I0->getIterator()), At0, WritesMemory)) - return false; - if (std::any_of(std::next(I1->getIterator()), At1, WritesMemory)) - return false; - return true; -} - // All blocks in Blocks unconditionally jump to a common successor. Analyze // the last non-terminator instruction in each block and return true if it would // be possible to sink them into their successor, creating one common // instruction instead. Set NumPHIsRequired to the number of PHI nodes that // would need to be created during sinking. -static bool canSinkLastInstruction(ArrayRef Blocks, - unsigned &NumPHIsRequired) { - SmallVector Insts; - for (auto *BB : Blocks) { - if (BB->getTerminator() == &BB->front()) - // Block was empty. - return false; - Insts.push_back(BB->getTerminator()->getPrevNode()); - } - +static bool canSinkInstruction(ArrayRef Insts, + SmallVectorImpl &ValuesNeedingPHIs) { // Prune out obviously bad instructions to move. Any non-store instruction // must have exactly one use, and we check later that use is by a single, // common PHI instruction in the successor. @@ -1410,24 +1345,25 @@ if (!I->isSameOperationAs(I0)) return false; - // If this isn't a store, check the only user is a single PHI. + // If this isn't a store, check the only user is a single PHI or in this + // block. If a user is in this block then it must come after I0 and + // therefore be sunk. if (!isa(I0)) { auto *PNUse = dyn_cast(*I0->user_begin()); - if (!PNUse || - !all_of(Insts, [&PNUse](const Instruction *I) { - return *I->user_begin() == PNUse; + if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool { + auto *U = cast(*I->user_begin()); + return U == PNUse || U->getParent() == I->getParent(); })) return false; } - NumPHIsRequired = 0; + ValuesNeedingPHIs.clear(); for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) { if (I0->getOperand(OI)->getType()->isTokenTy()) // Don't touch any operand of token type. return false; auto SameAsI0 = [&I0, OI](const Instruction *I) { - return areValuesTriviallySame(I->getOperand(OI), I->getIterator(), - I0->getOperand(OI), I0->getIterator()); + return I->getOperand(OI) == I0->getOperand(OI); }; if (!all_of(Insts, SameAsI0)) { if (I0->mustOperandBeConstant(OI)) @@ -1438,7 +1374,8 @@ // FIXME: if the call was *already* indirect, we should do this. return false; } - ++NumPHIsRequired; + for (auto *I : Insts) + ValuesNeedingPHIs.push_back(I->getOperand(OI)); } } return true; @@ -1448,10 +1385,6 @@ // instruction of every block in Blocks to their common successor, commoning // into one instruction. static void sinkLastInstruction(ArrayRef Blocks) { - unsigned Dummy; - (void)Dummy; - assert(canSinkLastInstruction(Blocks, Dummy) && - "Must analyze before transforming!"); auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0); // canSinkLastInstruction returning true guarantees that every block has at @@ -1519,6 +1452,8 @@ assert(BI1->isUnconditional()); BasicBlock *BBEnd = BI1->getSuccessor(0); + // We currently only support branch targets with two predecessors. + // FIXME: this is an arbitrary restriction and should be lifted. SmallVector Blocks; for (auto *BB : predecessors(BBEnd)) Blocks.push_back(BB); @@ -1530,12 +1465,70 @@ return false; bool Changed = false; - unsigned NumPHIsToInsert; - while (canSinkLastInstruction(Blocks, NumPHIsToInsert) && NumPHIsToInsert <= 1) { + SmallVector Insts; + + // Helper to populate the Insts list with the Idx'th instruction from + // the end of each block (excluding terminator). So Idx=0 would populate + // Insts with the last non-terminator instruction from each block. + // + // Return false if this isn't possible (one block isn't large enough) + auto PopulateInsts = [&Insts,&Blocks](unsigned Idx) -> bool { + Insts.clear(); + for (auto *BB : Blocks) { + if (BB->size() - 1 <= Idx) + // Block wasn't big enough + return false; + Insts.push_back(&*std::prev(BB->getTerminator()->getIterator(), Idx + 1)); + } + return true; + }; + + // We take a two-step approach to tail sinking. First we scan from the end of + // each block upwards in lockstep. If the n'th instruction from the end of each + // block can be sunk, the values are added to ValuesToSink and we carry on. + // If we can sink an instruction but need to PHI-merge some operands (because + // they're not identical in each instruction) we add these to + // ValuesNeedingPHIs. + unsigned ScanIdx = 0; + SmallPtrSet ValuesToSink; + SmallVector,4> ValuesNeedingPHIs; + SmallVector ThisValuesNeedingPHIs; + while (PopulateInsts(ScanIdx) && + canSinkInstruction(Insts, ThisValuesNeedingPHIs)) { + ValuesNeedingPHIs.push_back(ThisValuesNeedingPHIs); + ValuesToSink.insert(Insts.begin(), Insts.end()); + ++ScanIdx; + } + + // Now that we've analyzed all potential sinking candidates, perform the + // actual sink. We iteratively sink the last non-terminator of the source + // blocks into their common successor unless doing so would require too + // many PHI instructions to be generated (currently only one PHI is allowed + // per sunk instruction). + // + // We can use ValuesToSink to discount values needing PHI-merging that will + // actually be sunk in a later iteration. This allows us to be more + // aggressive in what we sink. This does allow a false positive where we + // sink presuming a later value will also be sunk, but stop half way through + // and never actually sink it which means we produce more PHIs than intended. + // This is unlikely in practice though. + for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) { + unsigned NumPHIdValues = 0; + for (auto *V : ValuesNeedingPHIs[SinkIdx]) + if (ValuesToSink.count(V) == 0) + ++NumPHIdValues; + assert((NumPHIdValues % Blocks.size() == 0) && + "Every operand must either be PHId or not PHId!"); + + if (NumPHIdValues / Blocks.size() > 1) + // Too many PHIs would be created. + break; + sinkLastInstruction(Blocks); NumSinkCommons++; Changed = true; } + return Changed; } Index: test/Transforms/SimplifyCFG/sink-common-code.ll =================================================================== --- test/Transforms/SimplifyCFG/sink-common-code.ll +++ test/Transforms/SimplifyCFG/sink-common-code.ll @@ -333,3 +333,63 @@ ; CHECK-LABEL: test13 ; CHECK: %[[x:.*]] = select i1 %flag ; CHECK: call i32 @bar(i32 %[[x]]) + +; The load should be commoned. +define i32 @test14(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %dummy = add i32 %x, 1 + %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1 + %sv1 = load i32, i32* %gepa + %cmp1 = icmp eq i32 %sv1, 56 + br label %if.end + +if.else: + %dummy2 = add i32 %x, 4 + %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1 + %sv2 = load i32, i32* %gepb + %cmp2 = icmp eq i32 %sv2, 57 + br label %if.end + +if.end: + %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ] + ret i32 1 +} + +; CHECK-LABEL: test14 +; CHECK: getelementptr +; CHECK: load +; CHECK-NOT: load + +; The load should be commoned. +define i32 @test15(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %dummy = add i32 %x, 1 + %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0 + %sv1 = load i32, i32* %gepa + %ext1 = zext i32 %sv1 to i64 + %cmp1 = icmp eq i64 %ext1, 56 + br label %if.end + +if.else: + %dummy2 = add i32 %x, 4 + %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1 + %sv2 = load i32, i32* %gepb + %ext2 = zext i32 %sv2 to i64 + %cmp2 = icmp eq i64 %ext2, 57 + br label %if.end + +if.end: + %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ] + ret i32 1 +} + +; CHECK-LABEL: test15 +; CHECK: getelementptr +; CHECK: load +; CHECK-NOT: load