Index: lib/Transforms/Scalar/GVN.cpp
===================================================================
--- lib/Transforms/Scalar/GVN.cpp
+++ lib/Transforms/Scalar/GVN.cpp
@@ -1514,6 +1514,119 @@
   }
 }
 
+/// Return true if 'Header' is actually the header of a valid loop. PredBlocks
+/// are either direct predecessors of the 'Header' BB or indirect predecessors
+/// through 'merge only' blocks.
+static bool isLoopHeader(BasicBlock *Header,
+                         SmallVectorImpl<BasicBlock *> &PredBlocks,
+                         DominatorTree &DT) {
+  // TODO: If LoopInfo is available, just use that.
+
+  // Without at least two predecessors, this can't be a loop.
+  if (PredBlocks.size() <= 1)
+    return false;
+
+  // We can't assume we're given a loop header. We need to ensure that all of
+  // the non-merging predecessors are either inside the loop (i.e. dominated
+  // by the header) or a single preheader. If we find any block which doesn't
+  // meet this requirement, we do not have a natural loop.
+  // Note: The term 'preheader' is being slightly abused here. We allow any
+  // predecessor which properly dominates the header, not just a dedicated
+  // preheader.
+  BasicBlock *LoopPred = nullptr;
+  for (BasicBlock *Pred : PredBlocks) {
+    if (DT.properlyDominates(Pred, Header)) {
+      assert(!LoopPred && "Can only have one preheader!");
+      LoopPred = Pred;
+    } else if (DT.dominates(Header, Pred)) {
+      // If this is a loop, this is either a latch, or a block which leads to
+      // a latch (i.e. via a merge only block).
+    } else {
+      // Not a natural loop at all.
+      return false;
+    }
+  }
+  // At this point, we've seen all the predecessors. Did we find *one* from
+  // outside the loop? If not, this is dead code and should just be deleted.
+  return LoopPred != nullptr;
+}
+
+/// Return true if this is a natural loop header and at least one 'real latch'
+/// (i.e. what was a latch before LoopSimplify introduced a merge point) is
+/// fully available. The idea here is that we want to perform a transform
+/// analogous to LICM, but also allow pushing an extra copy of the load into
+/// one of several paths which lead to the backedge. (That is, most paths
+/// through the loop should not involve a load, but the path which clobbers
+/// it will reload.)
+static bool isLICMLike(BasicBlock *Header,
+                       SmallVectorImpl<BasicBlock *> &PredBlocks,
+                       unsigned NumUnavailable,
+                       DominatorTree &DT) {
+  // We can only judge profitability if this is a loop.
+  if (!isLoopHeader(Header, PredBlocks, DT))
+    return false;
+
+  // Do we end up with at least one (possibly indirect) predecessor
+  // (i.e. hopefully a latch) where the load is fully available? The
+  // unavailable block could also be the loop predecessor, in which case this
+  // devolves to classic LICM.
+  return PredBlocks.size() > NumUnavailable;
+}
+
+/// Return true if this is a merge block.
+/// A 'merge block' is one that ends with an unconditional jump to a single
+/// basic block and contains no useful computation of its own. Such blocks
+/// arise frequently from loop simplify. JumpThreading removes such blocks,
+/// but may not have run immediately before GVN.
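+/// For reference, a typical merge only block as produced by loop simplify
+/// holds at most PHIs followed by an unconditional branch. Illustrative IR
+/// only, not taken from this patch's tests:
+///   merge:
+///     %x = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+///     br label %header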
+static bool isMergeOnlyBlock(BasicBlock *BB) {
+  if (&BB->getParent()->getEntryBlock() == BB)
+    return false;
+  if (BB->getTerminator()->getNumSuccessors() != 1)
+    return false;
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    switch (I->getOpcode()) {
+    default:
+      // Unsupported instruction.
+      return false;
+    case Instruction::PHI:
+    case Instruction::Br:
+      continue;
+    }
+  }
+  return true;
+}
+
+/// Given a 'merge block', return true if we think it is profitable to look
+/// through the block and treat its predecessors as possible insertion
+/// points. This is conservative in that it defaults to not splitting unless
+/// doing so clearly does no harm.
+static bool
+isProfitableToSplit(BasicBlock *Merge,
+                    DenseMap<BasicBlock *, char> &FullyAvailableBlocks) {
+  // If, looking at the merge block's predecessors, we find only a single
+  // unavailable predecessor, we can do no harm by looking through it, since
+  // that block would end up in our list of unavailable blocks anyway.
+  BasicBlock *UnavailablePred = nullptr;
+  for (pred_iterator PI = pred_begin(Merge), E = pred_end(Merge);
+       PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0))
+      continue;
+    if (UnavailablePred && UnavailablePred != Pred)
+      return false;
+    UnavailablePred = Pred;
+
+    if (Pred->getTerminator()->getNumSuccessors() != 1) {
+      // We'd need to split this critical edge and don't currently keep
+      // around the info to do so if we look through merge blocks.
+      return false;
+    }
+  }
+  return UnavailablePred != nullptr;
+}
+
 bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
                          UnavailBlkVect &UnavailableBlocks) {
   // Okay, we have *some* definitions of the value.  This means that the value
@@ -1561,13 +1674,63 @@
   for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
     FullyAvailableBlocks[UnavailableBlocks[i]] = false;
 
-  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  // When we get done, we want PredBlocks to contain all of the (possibly
+  // indirect) predecessors we considered for placement, including those which
+  // are fully available. We only exclude a block from this list if we've
+  // looked through it to consider *its* predecessors. This only happens for
+  // partially available 'merge' blocks.
+  SmallVector<BasicBlock *, 8> PredBlocks;
+  SmallSet<BasicBlock *, 8> Seen;
+  // Step 1 - Identify the unique direct predecessors of LoadBB.
   for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
        PI != E; ++PI) {
     BasicBlock *Pred = *PI;
+    if (!Seen.count(Pred)) {
+      PredBlocks.push_back(Pred);
+      Seen.insert(Pred);
+    }
+  }
+
+  // Step 2 - Identify actual insertion sites. This may involve looking
+  // through some of the merge blocks already in the list, so iterate by
+  // index; the push_backs below would invalidate iterators.
+  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  for (unsigned Idx = 0; Idx < PredBlocks.size(); ++Idx) {
+    BasicBlock *Pred = PredBlocks[Idx];
     if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
       continue;
     }
+
+    // If this is a merge block - one with a single successor and no
+    // computation of its own - which is partially available, we may want to
+    // look at its predecessors as possible load insertion points. Note that
+    // a merge block cannot affect whether a load is 'anticipated' along some
+    // path. This might be the single block we want to insert a load in; look
+    // through this block only if doing so can do no harm. This is enough to
+    // get most cases produced by loop simplify, which will trigger the
+    // LICM-like case below.
+    if (isMergeOnlyBlock(Pred) &&
+        isProfitableToSplit(Pred, FullyAvailableBlocks)) {
+      DEBUG(dbgs() << "MergeOnly: " << Pred->getName() << "\n");
+      // First, add each predecessor block which hasn't already been
+      // considered as a placement candidate. We happen to know that only
+      // one of those is unavailable, but we need to add each so we can judge
+      // profitability when we get done.
+      for (pred_iterator MPI = pred_begin(Pred), ME = pred_end(Pred);
+           MPI != ME; ++MPI) {
+        BasicBlock *MergePred = *MPI;
+        if (!Seen.count(MergePred)) {
+          Seen.insert(MergePred);
+          PredBlocks.push_back(MergePred);
+        }
+      }
+      // Delete the original merge block from the list, then revisit the
+      // element which now occupies this index.
+      PredBlocks.erase(PredBlocks.begin() + Idx);
+      --Idx;
+      continue;
+    }
 
     if (Pred->getTerminator()->getNumSuccessors() != 1) {
       if (isa<IndirectBrInst>(Pred->getTerminator())) {
@@ -1582,7 +1745,6 @@
               << Pred->getName() << "': " << *LI << '\n');
         return false;
       }
-
       CriticalEdgePred.push_back(Pred);
     } else {
       // Only add the predecessors that will not be split for now.
@@ -1590,16 +1752,30 @@
     }
   }
 
+  DEBUG({
+    for (auto *P : PredBlocks)
+      dbgs() << "pred block: " << P->getName() << "\n";
+  });
+
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block. With care, this works for 'indirect' predecessors due to
+  // merge blocks as well.
+
   // Decide whether PRE is profitable for this load.
   unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
   assert(NumUnavailablePreds != 0 &&
          "Fully available value should already be eliminated!");
 
-  // If this load is unavailable in multiple predecessors, reject it.
-  // FIXME: If we could restructure the CFG, we could make a common pred with
-  // all the preds that don't have an available LI and insert a new load into
-  // that one block.
-  if (NumUnavailablePreds != 1)
+  // We judge a load placement as profitable if:
+  // - We're moving a single load into some (potentially indirect)
+  //   predecessor, or
+  // - We can hoist a load out of a loop header and reload it along only some
+  //   paths through the loop. This is analogous to LICM, but additionally
+  //   allows a reload along one path in loops with interior control flow or
+  //   multiple latches.
+  if (NumUnavailablePreds != 1 &&
+      !isLICMLike(LoadBB, PredBlocks, NumUnavailablePreds, *DT))
     return false;
 
   // Split critical edges, and update the unavailable predecessors accordingly.
@@ -1671,6 +1847,8 @@
   for (const auto &PredLoad : PredLoads) {
     BasicBlock *UnavailablePred = PredLoad.first;
     Value *LoadPtr = PredLoad.second;
+    DEBUG(dbgs() << "Pred Load: " << UnavailablePred->getName() << " "
+                 << *LoadPtr << "\n");
 
     Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
                                         LI->getAlignment(),
@@ -1711,6 +1889,14 @@
   LoadDepVect Deps;
   MD->getNonLocalPointerDependency(LI, Deps);
 
+  DEBUG({
+    for (auto &Dep : Deps) {
+      if (Instruction *I = Dep.getResult().getInst())
+        dbgs() << *I << "\n";
+      else
+        dbgs() << "nullptr\n";
+      dbgs() << Dep.getBB()->getName() << "\n";
+    }
+  });
+
   // If we had to process more than one hundred blocks to find the
   // dependencies, this load isn't worth worrying about.  Optimizing
   // it will be too expensive.
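
Note on the TODO in isLoopHeader: if a preserved LoopInfo were available at
this point, the hand-rolled dominance checks could collapse to a direct
query. A minimal sketch, assuming a LoopInfo reference 'LInfo' were plumbed
through (it is not in this patch); this is slightly stricter than the check
above, since getLoopPreheader() requires a dedicated preheader:

  static bool isLoopHeaderViaLoopInfo(BasicBlock *Header, LoopInfo &LInfo) {
    // A natural loop rooted at Header, with a unique preheader standing in
    // for the 'single dominating predecessor' check in isLoopHeader.
    Loop *L = LInfo.getLoopFor(Header);
    return L && L->getHeader() == Header && L->getLoopPreheader() != nullptr;
  }
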
Index: test/Transforms/GVN/pre-loopsimplify.ll
===================================================================
--- /dev/null
+++ test/Transforms/GVN/pre-loopsimplify.ll
@@ -0,0 +1,113 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+target datalayout = "e"
+
+declare void @hold(i32) readonly
+declare void @clobber()
+
+; This is a classic LICM case.
+define i32 @test1(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test1
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br label %header
+}
+
+; This is classic LICM, but we need to look through a merge block
+; to see it.
+define i32 @test2(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test2
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  br label %merge
+
+bb2:
+  br label %merge
+
+merge:
+  br label %header
+}
+
+; In this case we have a load which is 'almost' loop invariant:
+; it's modified along one path, but not another. We want to
+; pull the load out of the loop and reload it in the clobbering
+; block.
+define i32 @test3(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test3
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre1 = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  br label %header
+
+bb2:
+; CHECK-LABEL: bb2
+; CHECK: call void @clobber()
+; CHECK-NEXT: %v1.pre = load i32* %p
+; CHECK-NEXT: br label %header
+  call void @clobber()
+  br label %header
+}
+
+; Same as above, but we need to look through a merge block
+; to see it. This is the form that loop-simplify produces
+; and is thus the most important test case in this file.
+define i32 @test4(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test4
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  br label %merge
+
+bb2:
+; CHECK-LABEL: bb2
+; CHECK: call void @clobber()
+; CHECK-NEXT: %v1.pre1 = load i32* %p
+; CHECK-NEXT: br label %merge
+  call void @clobber()
+  br label %merge
+
+merge:
+  br label %header
+}
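
For completeness, a sketch of a case the profitability logic should reject
(illustrative only, not part of the patch's test file): when the load is
clobbered along every path to the backedge, no latch is fully available,
PredBlocks.size() equals NumUnavailable, and isLICMLike returns false.

  define i32 @test_neg(i32* %p) {
  entry:
    br label %header

  header:
    %v1 = load i32* %p
    call void @hold(i32 %v1)
    ; The only path back to the header clobbers the load, so there is no
    ; fully available latch to pair with a hoisted copy.
    call void @clobber()
    br label %header
  }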