Index: lib/Transforms/Scalar/GVN.cpp
===================================================================
--- lib/Transforms/Scalar/GVN.cpp
+++ lib/Transforms/Scalar/GVN.cpp
@@ -1514,6 +1514,99 @@
   }
 }
 
+/// Return true if this is a natural loop header and at least one 'real latch'
+/// (i.e. what was a latch before LoopSimplify introduced a merge point) is
+/// fully available. The idea here is that we want to perform a transform
+/// analogous to LICM, but also allow pushing an extra copy of the load into
+/// one of several paths which lead to the backedge. (i.e. most paths through
+/// the loop should not involve a load, but the path which clobbers it will
+/// reload.)
+static bool isLICMLike(BasicBlock *Header,
+                       SmallVectorImpl<BasicBlock *> &PredBlocks,
+                       unsigned NumUnavailable, DominatorTree &DT) {
+  // We can't assume we're given a loop header. We need to ensure that all
+  // non-merging predecessors are either inside the loop (i.e. dominated by
+  // the header) or a single preheader. If we find any block which doesn't
+  // meet this requirement, we do not have a natural loop.
+  BasicBlock *PreHeader = nullptr;
+  unsigned NumLatches = 0;
+  for (BasicBlock *Pred : PredBlocks) {
+    if (DT.properlyDominates(Pred, Header)) {
+      assert(!PreHeader && "Can only have one preheader!");
+      PreHeader = Pred;
+    } else if (DT.dominates(Header, Pred)) {
+      // If this is a loop, this is either a latch, or a block which leads to
+      // a latch (i.e. a merge-only block).
+      NumLatches++;
+    } else {
+      // Not a natural loop at all.
+      return false;
+    }
+  }
+  // Is this a loop header at all?  If so, do we end up with at least one
+  // latch where the load is fully available?
+  return PreHeader && NumLatches && NumLatches + 1 > NumUnavailable;
+}
+
+/// Return true if this is a merge block.
+/// A 'merge block' is one that ends with an unconditional jump to a single
+/// basic block and contains no useful computation of its own. Such blocks
+/// arise frequently from LoopSimplify. JumpThreading removes such blocks,
+/// but may not have run immediately before GVN.
+static bool isMergeOnlyBlock(BasicBlock *BB) {
+  if (&BB->getParent()->getEntryBlock() == BB)
+    return false;
+  if (BB->getTerminator()->getNumSuccessors() != 1)
+    return false;
+  for (Instruction &I : *BB) {
+    switch (I.getOpcode()) {
+    default:
+      // Unsupported instruction.
+      return false;
+    case Instruction::PHI:
+    case Instruction::Br:
+      continue;
+    }
+  }
+  return true;
+}
+
+/// Given a 'merge block', return true if we think it is profitable to look
+/// through the block and treat its predecessors as possible insertion
+/// points. This is conservative in that it defaults to not splitting unless
+/// it clearly does no harm.
+static bool
+isProfitableToSplit(BasicBlock *Merge,
+                    DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
+  // If, looking at the merge block's predecessors, we find only a single
+  // unavailable predecessor, we can do no harm by looking through it, since
+  // that block would end up in our list of unavailable blocks anyway.
+  bool HasUnavailablePred = false;
+  for (pred_iterator PI = pred_begin(Merge), E = pred_end(Merge);
+       PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0))
+      continue;
+    if (HasUnavailablePred)
+      return false;
+    HasUnavailablePred = true;
+
+    if (Pred->getTerminator()->getNumSuccessors() != 1) {
+      // We'd need to split this critical edge and don't currently keep
+      // around the info to do so if we look through merge blocks.
+      return false;
+    }
+  }
+  return true;
+}
+
 bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
                          UnavailBlkVect &UnavailableBlocks) {
   // Okay, we have *some* definitions of the value.  This means that the value
@@ -1561,14 +1654,51 @@
   for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
     FullyAvailableBlocks[UnavailableBlocks[i]] = false;
 
-  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  // Note: PredBlocks here includes indirect predecessors which reach the
+  // basic block of interest only through partially available merge blocks.
+  SmallVector<BasicBlock *, 32> PredBlocks;
+  SmallSet<BasicBlock *, 32> Seen;
   for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
        PI != E; ++PI) {
     BasicBlock *Pred = *PI;
+    PredBlocks.push_back(Pred);
+    Seen.insert(Pred);
+  }
+
+  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  // PredBlocks may grow (and reallocate) as we look through merge blocks, so
+  // iterate by index rather than by iterator.
+  for (unsigned i = 0; i != PredBlocks.size(); ++i) {
+    BasicBlock *Pred = PredBlocks[i];
     if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
       continue;
     }
 
+    // If this is a merge block - one with a single successor and no
+    // computation of its own - which is partially available, we may want to
+    // look at its predecessors as possible load insertion points. Note that
+    // a merge block cannot affect whether a load is 'anticipated' along some
+    // path.
+    if (isMergeOnlyBlock(Pred)) {
+      DEBUG(dbgs() << "MergeOnly: " << Pred->getName() << "\n");
+      // This might be the single block we want to insert a load in. Look
+      // through this block only if we can do no harm. This is enough to get
+      // most cases produced by LoopSimplify, which will trigger the
+      // LICM-like case below.
+      if (isProfitableToSplit(Pred, FullyAvailableBlocks)) {
+        for (pred_iterator MPI = pred_begin(Pred), ME = pred_end(Pred);
+             MPI != ME; ++MPI) {
+          BasicBlock *MergePred = *MPI;
+          // Don't revisit blocks; otherwise a cycle through merge blocks
+          // would loop forever.
+          if (!Seen.count(MergePred)) {
+            Seen.insert(MergePred);
+            PredBlocks.push_back(MergePred);
+          }
+        }
+        continue;
+      }
+      // Otherwise, fall through.
+    }
+
     if (Pred->getTerminator()->getNumSuccessors() != 1) {
       if (isa<IndirectBrInst>(Pred->getTerminator())) {
         DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
@@ -1582,7 +1712,6 @@
               << Pred->getName() << "': " << *LI << '\n');
         return false;
       }
-
       CriticalEdgePred.push_back(Pred);
     } else {
       // Only add the predecessors that will not be split for now.
@@ -1590,16 +1719,25 @@
     }
   }
 
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block. With care, this works for 'indirect' predecessors due to
+  // merge blocks as well.
+
   // Decide whether PRE is profitable for this load.
   unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
   assert(NumUnavailablePreds != 0 &&
          "Fully available value should already be eliminated!");
 
-  // If this load is unavailable in multiple predecessors, reject it.
-  // FIXME: If we could restructure the CFG, we could make a common pred with
-  // all the preds that don't have an available LI and insert a new load into
-  // that one block.
-  if (NumUnavailablePreds != 1)
+  // We judge a load placement as profitable if:
+  // - We're moving a single load into some (potentially indirect)
+  //   predecessor.
+  // - We can pull a load into a loop header and reload it only along some
+  //   paths through the loop. This is analogous to LICM, but additionally
+  //   allows a reload along one path in loops with interior control flow or
+  //   multiple latches.
+  if (NumUnavailablePreds != 1 &&
+      !isLICMLike(LoadBB, PredBlocks, NumUnavailablePreds, *DT))
     return false;
 
   // Split critical edges, and update the unavailable predecessors accordingly.
@@ -1671,6 +1809,8 @@
   for (const auto &PredLoad : PredLoads) {
     BasicBlock *UnavailablePred = PredLoad.first;
     Value *LoadPtr = PredLoad.second;
+    DEBUG(dbgs() << "Pred Load: " << UnavailablePred->getName() << "\n"
+                 << *LoadPtr << "\n");
 
     Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
                                         LI->getAlignment(),
@@ -1711,6 +1851,14 @@
   LoadDepVect Deps;
   MD->getNonLocalPointerDependency(LI, Deps);
 
+  DEBUG(for (auto &Dep : Deps) {
+    if (Instruction *I = Dep.getResult().getInst())
+      dbgs() << *I << "\n";
+    else
+      dbgs() << "nullptr\n";
+    dbgs() << Dep.getBB()->getName() << "\n";
+  });
+
   // If we had to process more than one hundred blocks to find the
   // dependencies, this load isn't worth worrying about.  Optimizing
   // it will be too expensive.
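
For reviewers: below is a hand-written IR sketch of the LICM-like case the
change above is intended to catch. It is not taken from the patch or its
tests; all block and value names are illustrative, and it uses the 3.x-era
'load <ty>* <ptr>' syntax. Before GVN, the load executes on every iteration
even though only one rare path clobbers %p:

  define i32 @f(i32* %p, i64 %n) {
  entry:
    br label %header

  header:                               ; preds: %entry, %merge
    %i = phi i64 [ 0, %entry ], [ %i.next, %merge ]
    %v = load i32* %p                   ; reloaded every iteration
    %done = icmp eq i64 %i, %n
    br i1 %done, label %exit, label %body

  body:
    %i.next = add i64 %i, 1
    %rare = icmp eq i64 %i, 42
    br i1 %rare, label %clobber, label %noclobber

  clobber:                              ; the one path which clobbers %p
    store i32 0, i32* %p
    br label %merge

  noclobber:
    br label %merge

  merge:                                ; merge-only block from LoopSimplify
    br label %header

  exit:
    ret i32 %v
  }

With the patch, %merge is recognized as a merge-only block, %clobber becomes
the single unavailable (indirect) predecessor, and isLICMLike accepts the
header, so PRE should produce roughly:

  entry:
    %v.pre = load i32* %p               ; hoisted into the preheader
    br label %header

  header:
    %v = phi i32 [ %v.pre, %entry ], [ %v.merge, %merge ]
    ...

  clobber:
    store i32 0, i32* %p
    %v.reload = load i32* %p            ; reload only on the rare path
    br label %merge

  merge:
    %v.merge = phi i32 [ %v.reload, %clobber ], [ %v, %noclobber ]
    br label %header

Something of this shape, run through 'opt -basicaa -gvn -S', should show
whether the transform fires and would make a reasonable starting point for a
regression test.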