Index: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -250,6 +250,17 @@
     return InstMap;
   }
 
+  /// \brief Generate a mapping between the memory instructions and their
+  /// indices according to program order.
+  DenseMap<Instruction *, unsigned> generateInstructionOrderMap() const {
+    DenseMap<Instruction *, unsigned> OrderMap;
+
+    for (unsigned I = 0; I < InstMap.size(); ++I)
+      OrderMap[InstMap[I]] = I;
+
+    return OrderMap;
+  }
+
   /// \brief Find the set of instructions that read or write via \p Ptr.
   SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr,
                                                          bool isWrite) const;
@@ -453,6 +464,11 @@
   /// index \p I and \p J to prove their independence.
   bool needsChecking(unsigned I, unsigned J) const;
 
+  /// \brief Return PointerInfo for pointer at index \p PtrIdx.
+  const PointerInfo &getPointerInfo(unsigned PtrIdx) const {
+    return Pointers[PtrIdx];
+  }
+
 private:
   /// \brief Groups pointers such that a single memcheck is required
   /// between two different groups. This will clear the CheckingGroups vector
Index: llvm/trunk/include/llvm/InitializePasses.h
===================================================================
--- llvm/trunk/include/llvm/InitializePasses.h
+++ llvm/trunk/include/llvm/InitializePasses.h
@@ -301,6 +301,7 @@
 void initializeSjLjEHPreparePass(PassRegistry&);
 void initializeDemandedBitsPass(PassRegistry&);
 void initializeFuncletLayoutPass(PassRegistry &);
+void initializeLoopLoadEliminationPass(PassRegistry&);
 }
 
 #endif
Index: llvm/trunk/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar.h
+++ llvm/trunk/include/llvm/Transforms/Scalar.h
@@ -480,6 +480,12 @@
 //
 FunctionPass *createLoopDistributePass();
 
+//===----------------------------------------------------------------------===//
+//
+// LoopLoadElimination - Perform loop-aware load elimination.
+//
+FunctionPass *createLoopLoadEliminationPass();
+
 } // End llvm namespace
 
 #endif
Index: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -99,6 +99,10 @@
     cl::desc(
         "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline."));
 
+static cl::opt<bool> EnableLoopLoadElim(
+    "enable-loop-load-elim", cl::init(false), cl::Hidden,
+    cl::desc("Enable the new, experimental LoopLoadElimination Pass"));
+
 PassManagerBuilder::PassManagerBuilder() {
   OptLevel = 2;
   SizeLevel = 0;
@@ -378,6 +382,12 @@
   MPM.add(createLoopDistributePass());
 
   MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
+
+  // Eliminate loads by forwarding stores from the previous iteration to loads
+  // of the current iteration.
+  if (EnableLoopLoadElim)
+    MPM.add(createLoopLoadEliminationPass());
+
   // FIXME: Because of #pragma vectorize enable, the passes below are always
   // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
   // on -O1 and no #pragma is found). Would be good to have these two passes
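For context, the transformation the new pass performs can be pictured on plain scalar
code before looking at the implementation. The sketch below is illustrative only (it is
not part of the patch and the array names are made up): the value stored in iteration i
is carried in a scalar, so the load in iteration i+1 becomes dead.

  #include <cstddef>
  #include <vector>

  // Before: every iteration re-loads A[i], the value stored one iteration earlier.
  // (Assumes A holds at least N+1 elements.)
  void before(std::vector<int> &A, const std::vector<int> &B, std::size_t N) {
    for (std::size_t i = 0; i < N; ++i)
      A[i + 1] = A[i] + B[i];
  }

  // After, roughly what loop load elimination produces: the stored value is
  // forwarded through a scalar that plays the role of the PHI the pass creates.
  void after(std::vector<int> &A, const std::vector<int> &B, std::size_t N) {
    int Forwarded = A[0];             // initial load, hoisted to the "preheader"
    for (std::size_t i = 0; i < N; ++i) {
      int NewVal = Forwarded + B[i];  // uses the forwarded value; no load of A[i]
      A[i + 1] = NewVal;
      Forwarded = NewVal;             // carried into the next iteration
    }
  }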
Index: llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
+++ llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
@@ -21,6 +21,7 @@
   LoopIdiomRecognize.cpp
   LoopInstSimplify.cpp
   LoopInterchange.cpp
+  LoopLoadElimination.cpp
   LoopRerollPass.cpp
   LoopRotation.cpp
   LoopStrengthReduce.cpp
Index: llvm/trunk/lib/Transforms/Scalar/LoopLoadElimination.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ llvm/trunk/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,554 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the
+// users of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <forward_list>
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+using namespace llvm;
+
+static cl::opt<unsigned> CheckPerElim(
+    "runtime-check-per-loop-load-elim", cl::Hidden,
+    cl::desc("Max number of memchecks allowed per eliminated load on average"),
+    cl::init(1));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
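The candidate test defined just below, isDependenceDistanceOfOne, boils down to simple
pointer arithmetic: the store address must be exactly one element away from the load
address of the same iteration. A standalone sketch of that check using plain pointers
instead of SCEV (the function name is made up for illustration):

  #include <cstddef>

  // True if StorePtr and LoadPtr are exactly one element apart, i.e. the value
  // stored in iteration i is the value loaded in iteration i+1 (or i-1). The
  // pass computes the same difference symbolically via SE->getMinusSCEV and
  // compares its magnitude against the element size in bytes.
  bool isDistanceOfOneElement(const int *StorePtr, const int *LoadPtr) {
    std::ptrdiff_t Dist = StorePtr - LoadPtr; // element distance
    return Dist == 1 || Dist == -1;
  }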
+/// \brief Represent a store-to-forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+  LoadInst *Load;
+  StoreInst *Store;
+
+  StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+      : Load(Load), Store(Store) {}
+
+  /// \brief Return true if the dependence from the store to the load has a
+  /// distance of one. E.g. A[i+1] = A[i]
+  bool isDependenceDistanceOfOne(ScalarEvolution *SE) const {
+    Value *LoadPtr = Load->getPointerOperand();
+    Value *StorePtr = Store->getPointerOperand();
+    Type *LoadPtrType = LoadPtr->getType();
+    Type *StorePtrType = StorePtr->getType();
+    Type *LoadType = LoadPtrType->getPointerElementType();
+
+    assert(LoadPtrType->getPointerAddressSpace() ==
+               StorePtrType->getPointerAddressSpace() &&
+           LoadType == StorePtrType->getPointerElementType() &&
+           "Should be a known dependence");
+
+    auto &DL = Load->getParent()->getModule()->getDataLayout();
+    unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+    auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
+    auto *StorePtrSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+
+    // We don't need to check non-wrapping here because forward/backward
+    // dependence wouldn't be valid if these weren't monotonic accesses.
+    auto *Dist =
+        cast<SCEVConstant>(SE->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+    const APInt &Val = Dist->getValue()->getValue();
+    return Val.abs() == TypeByteSize;
+  }
+
+  Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const StoreToLoadForwardingCandidate &Cand) {
+    OS << *Cand.Store << " -->\n";
+    OS.indent(2) << *Cand.Load << "\n";
+    return OS;
+  }
+#endif
+};
+
+/// \brief Check if the store dominates all latches, so as long as there is no
+/// intervening store this value will be loaded in the next iteration.
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+                                  DominatorTree *DT) {
+  SmallVector<BasicBlock *, 8> Latches;
+  L->getLoopLatches(Latches);
+  return std::all_of(Latches.begin(), Latches.end(),
+                     [&](const BasicBlock *Latch) {
+                       return DT->dominates(StoreBlock, Latch);
+                     });
+}
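The latch-domination requirement above rules out conditional stores: if the store does
not execute on every path to the latch, the previous iteration may not have produced the
value at all. A made-up scalar example of the kind of loop this rejects:

  #include <cstddef>
  #include <vector>

  // The store to A[i+1] is guarded, so the load of A[i] in the next iteration is
  // not guaranteed to see a value stored one iteration earlier; forwarding the
  // stored value through a scalar would be wrong here.
  void rejected(std::vector<int> &A, const std::vector<int> &B, std::size_t N) {
    for (std::size_t i = 0; i < N; ++i) {
      int V = A[i] + B[i];
      if (V > 0)            // store block does not dominate the loop latch
        A[i + 1] = V;
    }
  }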
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+  LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+                         DominatorTree *DT, ScalarEvolution *SE)
+      : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {}
+
+  /// \brief Look through the loop-carried and loop-independent dependences in
+  /// this loop and find store->load dependences.
+  ///
+  /// Note that no candidate is returned if LAA has failed to analyze the loop
+  /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+  std::forward_list<StoreToLoadForwardingCandidate>
+  findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+    std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+    const auto *Deps = LAI.getDepChecker().getDependences();
+    if (!Deps)
+      return Candidates;
+
+    // Find store->load dependences (consequently true dep). Both lexically
+    // forward and backward dependences qualify. Disqualify loads that have
+    // other unknown dependences.
+
+    SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+    for (const auto &Dep : *Deps) {
+      Instruction *Source = Dep.getSource(LAI);
+      Instruction *Destination = Dep.getDestination(LAI);
+
+      if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+        if (isa<LoadInst>(Source))
+          LoadsWithUnknownDepedence.insert(Source);
+        if (isa<LoadInst>(Destination))
+          LoadsWithUnknownDepedence.insert(Destination);
+        continue;
+      }
+
+      if (Dep.isBackward())
+        // Note that the designations source and destination follow the program
+        // order, i.e. source is always first. (The direction is given by the
+        // DepType.)
+        std::swap(Source, Destination);
+      else
+        assert(Dep.isForward() && "Needs to be a forward dependence");
+
+      auto *Store = dyn_cast<StoreInst>(Source);
+      if (!Store)
+        continue;
+      auto *Load = dyn_cast<LoadInst>(Destination);
+      if (!Load)
+        continue;
+      Candidates.emplace_front(Load, Store);
+    }
+
+    if (!LoadsWithUnknownDepedence.empty())
+      Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+        return LoadsWithUnknownDepedence.count(C.Load);
+      });
+
+    return Candidates;
+  }
+
+  /// \brief Return the index of the instruction according to program order.
+  unsigned getInstrIndex(Instruction *Inst) {
+    auto I = InstOrder.find(Inst);
+    assert(I != InstOrder.end() && "No index for instruction");
+    return I->second;
+  }
+
+  /// \brief If a load has multiple candidates associated (i.e. different
+  /// stores), it means that it could be forwarding from multiple stores
+  /// depending on control flow. Remove these candidates.
+  ///
+  /// Here, we rely on LAA to include the relevant loop-independent dependences.
+  /// LAA is known to omit these in the very simple case when the read and the
+  /// write within an alias set always take place using the *same* pointer.
+  ///
+  /// However, we know that this is not the case here, i.e. we can rely on LAA
+  /// to provide us with loop-independent dependences for the cases we're
+  /// interested in. Consider the case for example where a loop-independent
+  /// dependence S1->S2 invalidates the forwarding S3->S2.
+  ///
+  ///   A[i]   = ...   (S1)
+  ///   ...    = A[i]  (S2)
+  ///   A[i+1] = ...   (S3)
+  ///
+  /// LAA will perform dependence analysis here because there are two
+  /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+  void removeDependencesFromMultipleStores(
+      std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+    // If Store is nullptr it means that we have multiple stores forwarding to
+    // this load.
+    typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
+        LoadToSingleCandT;
+    LoadToSingleCandT LoadToSingleCand;
+
+    for (const auto &Cand : Candidates) {
+      bool NewElt;
+      LoadToSingleCandT::iterator Iter;
+
+      std::tie(Iter, NewElt) =
+          LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+      if (!NewElt) {
+        const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+        // Already multiple stores forward to this load.
+        if (OtherCand == nullptr)
+          continue;
+
+        // Handle the very basic case when the two stores are in the same block
+        // so deciding which one forwards is easy. The later one forwards as
+        // long as they both have a dependence distance of one to the load.
+        if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+            Cand.isDependenceDistanceOfOne(SE) &&
+            OtherCand->isDependenceDistanceOfOne(SE)) {
+          // They are in the same block, the later one will forward to the load.
+          if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+            OtherCand = &Cand;
+        } else
+          OtherCand = nullptr;
+      }
+    }
+
+    Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+      if (LoadToSingleCand[Cand.Load] != &Cand) {
+        DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+                     << "  The load may have multiple stores forwarding to "
+                     << "it\n");
+        return true;
+      }
+      return false;
+    });
+  }
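The disambiguation rule above, i.e. when one load has several forwarding stores, keep
only the textually later store if both candidates sit in the same block and otherwise
give up on that load, can be sketched independently of LLVM with a map keyed by the
load and a program-order index. All names below are illustrative:

  #include <unordered_map>
  #include <vector>

  struct Candidate {
    int LoadId;     // which load this candidate forwards to
    int StoreId;    // program-order index of the forwarding store
    int StoreBlock; // stand-in for the store's basic block
  };

  // Keep at most one candidate per load: the later store when both candidates
  // come from the same block, otherwise drop the load (nullptr marker).
  std::vector<Candidate> disambiguate(const std::vector<Candidate> &Cands) {
    std::unordered_map<int, const Candidate *> Single;
    for (const Candidate &C : Cands) {
      auto It = Single.find(C.LoadId);
      if (It == Single.end()) {
        Single[C.LoadId] = &C;
      } else if (It->second && It->second->StoreBlock == C.StoreBlock) {
        if (It->second->StoreId < C.StoreId)
          It->second = &C;          // the later store forwards
      } else {
        It->second = nullptr;       // ambiguous: give up on this load
      }
    }
    std::vector<Candidate> Out;
    for (const Candidate &C : Cands)
      if (Single[C.LoadId] == &C)
        Out.push_back(C);
    return Out;
  }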
+
+  /// \brief Given two pointer operations by their RuntimePointerChecking
+  /// indices, return true if they require an alias check.
+  ///
+  /// We need a check if one is a pointer for a candidate load and the other is
+  /// a pointer for a possibly intervening store.
+  bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+                     const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+                     const std::set<Value *> &CandLoadPtrs) {
+    Value *Ptr1 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+    Value *Ptr2 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+    return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+            (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+  }
+
+  /// \brief Return pointers that are possibly written to on the path from a
+  /// forwarding store to a load.
+  ///
+  /// These pointers need to be alias-checked against the forwarding candidates.
+  SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+    // From FirstStore to LastLoad neither of the elimination candidate loads
+    // should overlap with any of the stores.
+    //
+    // E.g.:
+    //
+    // st1 C[i]
+    // ld1 B[i] <-------,
+    // ld0 A[i] <----,  |              * LastLoad
+    // ...           |  |
+    // st2 E[i]      |  |
+    // st3 B[i+1] -- | -'              * FirstStore
+    // st0 A[i+1] ---'
+    // st4 D[i]
+    //
+    // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+    // ld0.
+
+    LoadInst *LastLoad =
+        std::max_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+                         })
+            ->Load;
+    StoreInst *FirstStore =
+        std::min_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Store) <
+                                  getInstrIndex(B.Store);
+                         })
+            ->Store;
+
+    // We're looking for stores after the first forwarding store until the end
+    // of the loop, then from the beginning of the loop until the last
+    // forwarded-to load. Collect the pointer for the stores.
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+    auto InsertStorePtr = [&](Instruction *I) {
+      if (auto *S = dyn_cast<StoreInst>(I))
+        PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+    };
+    const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+    std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+                  MemInstrs.end(), InsertStorePtr);
+    std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+                  InsertStorePtr);
+
+    return PtrsWrittenOnFwdingPath;
+  }
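The "forwarding path" above wraps around the loop body: it runs from just after the
first forwarding store to the end of the body, and then from the start of the body up
to the last forwarded-to load. A standalone sketch over a program-ordered list of memory
operations (made-up types, not the LAA interface):

  #include <cstddef>
  #include <set>
  #include <string>
  #include <vector>

  struct MemOp {
    bool IsStore;
    std::string Ptr;  // stand-in for the pointer operand
  };

  // Collect pointers stored to on the wrap-around path
  // (FirstStore, end of body] followed by [start of body, LastLoad).
  std::set<std::string> writtenOnForwardingPath(const std::vector<MemOp> &Ops,
                                                std::size_t FirstStore,
                                                std::size_t LastLoad) {
    std::set<std::string> Written;
    auto Collect = [&](std::size_t Begin, std::size_t End) {
      for (std::size_t I = Begin; I < End; ++I)
        if (Ops[I].IsStore)
          Written.insert(Ops[I].Ptr);
    };
    Collect(FirstStore + 1, Ops.size()); // after the store to the end of the body
    Collect(0, LastLoad);                // start of the body up to the load
    return Written;
  }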
+
+  /// \brief Determine the pointer alias checks to prove that there are no
+  /// intervening stores.
+  SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+        findPointersWrittenOnForwardingPath(Candidates);
+
+    // Collect the pointers of the candidate loads.
+    // FIXME: SmallSet does not work with std::inserter.
+    std::set<Value *> CandLoadPtrs;
+    std::transform(Candidates.begin(), Candidates.end(),
+                   std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+                   std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+    const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+                 [&](const RuntimePointerChecking::PointerCheck &Check) {
+                   for (auto PtrIdx1 : Check.first->Members)
+                     for (auto PtrIdx2 : Check.second->Members)
+                       if (needsChecking(PtrIdx1, PtrIdx2,
+                                         PtrsWrittenOnFwdingPath, CandLoadPtrs))
+                         return true;
+                   return false;
+                 });
+
+    DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+    DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+    return Checks;
+  }
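collectMemchecks above keeps only the subset of LAA's runtime checks that matter for
forwarding: a pair needs checking exactly when one side may be written on the forwarding
path and the other side is a candidate load pointer, in either order. A minimal
standalone rendering of that predicate and filter, with illustrative names only:

  #include <set>
  #include <string>
  #include <utility>
  #include <vector>

  using PtrSet = std::set<std::string>;

  // True if the pair (P1, P2) mixes a pointer written on the forwarding path
  // with a candidate load pointer.
  bool pairNeedsChecking(const std::string &P1, const std::string &P2,
                         const PtrSet &WrittenOnPath, const PtrSet &CandLoadPtrs) {
    return (WrittenOnPath.count(P1) && CandLoadPtrs.count(P2)) ||
           (WrittenOnPath.count(P2) && CandLoadPtrs.count(P1));
  }

  // Filter all pointer pairs down to the checks the transformation needs.
  std::vector<std::pair<std::string, std::string>>
  collectChecks(const std::vector<std::pair<std::string, std::string>> &All,
                const PtrSet &WrittenOnPath, const PtrSet &CandLoadPtrs) {
    std::vector<std::pair<std::string, std::string>> Needed;
    for (const auto &Check : All)
      if (pairNeedsChecking(Check.first, Check.second, WrittenOnPath, CandLoadPtrs))
        Needed.push_back(Check);
    return Needed;
  }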
+
+  /// \brief Perform the transformation for a candidate.
+  void
+  propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+                                  SCEVExpander &SEE) {
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    Value *Ptr = Cand.Load->getPointerOperand();
+    auto *PtrSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
+    auto *PH = L->getLoopPreheader();
+    Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+                                          PH->getTerminator());
+    Value *Initial =
+        new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+    PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+                                   L->getHeader()->begin());
+    PHI->addIncoming(Initial, PH);
+    PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+    Cand.Load->replaceAllUsesWith(PHI);
+  }
+
+  /// \brief Top-level driver for each loop: find store->load forwarding
+  /// candidates, add run-time checks and perform transformation.
+  bool processLoop() {
+    DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+    // Look for store-to-load forwarding cases across the
+    // backedge. E.g.:
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    // First start with store->load dependences.
+    auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Generate an index for each load and store according to the original
+    // program order. This will be used later.
+    InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+    // To keep things simple for now, remove those where the load is
+    // potentially fed by multiple stores.
+    removeDependencesFromMultipleStores(StoreToLoadDependences);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Filter the candidates further.
+    SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+    unsigned NumForwarding = 0;
+    for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
+      DEBUG(dbgs() << "Candidate " << Cand);
+      // Make sure that the stored value is available everywhere in the loop in
+      // the next iteration.
+      if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+        continue;
+
+      // Check whether the SCEV difference is the same as the induction step,
+      // thus we load the value in the next iteration.
+      if (!Cand.isDependenceDistanceOfOne(SE))
+        continue;
+
+      ++NumForwarding;
+      DEBUG(dbgs()
+            << NumForwarding
+            << ". Valid store-to-load forwarding across the loop backedge\n");
+      Candidates.push_back(Cand);
+    }
+    if (Candidates.empty())
+      return false;
+
+    // Check intervening may-alias stores. These need runtime checks for alias
+    // disambiguation.
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+        collectMemchecks(Candidates);
+
+    // Too many checks are likely to outweigh the benefits of forwarding.
+    if (Checks.size() > Candidates.size() * CheckPerElim) {
+      DEBUG(dbgs() << "Too many run-time checks needed.\n");
+      return false;
+    }
+
+    // Point of no-return, start the transformation. First, version the loop if
+    // necessary.
+    if (!Checks.empty()) {
+      LoopVersioning LV(std::move(Checks), LAI, L, LI, DT);
+      LV.versionLoop();
+    }
+
+    // Next, propagate the value stored by the store to the users of the load.
+    // Also for the first iteration, generate the initial value of the load.
+    SCEVExpander SEE(*SE, L->getHeader()->getModule()->getDataLayout(),
+                     "storeforward");
+    for (const auto &Cand : Candidates)
+      propagateStoredValueToLoadUsers(Cand, SEE);
+    NumLoopLoadEliminted += NumForwarding;
+
+    return true;
+  }
+
+private:
+  Loop *L;
+
+  /// \brief Maps the load/store instructions to their index according to
+  /// program order.
+  DenseMap<Instruction *, unsigned> InstOrder;
+
+  // Analyses used.
+  LoopInfo *LI;
+  const LoopAccessInfo &LAI;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+};
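The profitability cut-off in processLoop above compares the number of runtime checks
that loop versioning would need against the number of loads the pass would remove,
scaled by the -runtime-check-per-loop-load-elim knob (default 1). A tiny standalone
rendering of that decision, names illustrative:

  #include <cstddef>

  // True if forwarding is still considered profitable: on average at most
  // CheckPerElim memchecks are allowed per eliminated load.
  bool worthVersioning(std::size_t NumChecks, std::size_t NumEliminatedLoads,
                       unsigned CheckPerElim = 1) {
    return NumChecks <= NumEliminatedLoads * CheckPerElim;
  }

The memcheck.ll test further down exercises both sides of this cut-off: with the default
limit a single eliminated load is not worth two check pairs, while
-runtime-check-per-loop-load-elim=2 lets the transformation proceed.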
+
+/// \brief The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+  LoopLoadElimination() : FunctionPass(ID) {
+    initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto *LAA = &getAnalysis<LoopAccessAnalysis>();
+    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+    // Build up a worklist of inner-loops to vectorize. This is necessary as the
+    // act of distributing a loop creates new loops and can invalidate iterators
+    // across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *TopLevelLoop : *LI)
+      for (Loop *L : depth_first(TopLevelLoop))
+        // We only handle inner-most loops.
+        if (L->empty())
+          Worklist.push_back(L);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    for (Loop *L : Worklist) {
+      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+      // The actual work is performed by LoadEliminationForLoop.
+      LoadEliminationForLoop LEL(L, LI, LAI, DT, SE);
+      Changed |= LEL.processLoop();
+    }
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<LoopAccessAnalysis>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  static char ID;
+};
+}
+
+char LoopLoadElimination::ID;
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopLoadEliminationPass() {
+  return new LoopLoadElimination();
+}
+}
Index: llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
+++ llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
@@ -83,6 +83,7 @@
   initializePlaceSafepointsPass(Registry);
   initializeFloat2IntPass(Registry);
   initializeLoopDistributePass(Registry);
+  initializeLoopLoadEliminationPass(Registry);
 }
 
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
Index: llvm/trunk/test/Transforms/LoopLoadElim/backward.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopLoadElim/backward.ll
+++ llvm/trunk/test/Transforms/LoopLoadElim/backward.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+
+; Simple st->ld forwarding derived from a lexical backward dep.
+;
+;   for (unsigned i = 0; i < 100; i++)
+;     A[i+1] = A[i] + B[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N) {
+entry:
+; CHECK: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %entry ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %load_1 = load i32, i32* %arrayidx2, align 4
+; CHECK: %add = add i32 %load_1, %store_forwarded
+  %add = add i32 %load_1, %load
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopLoadElim/def-store-before-load.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopLoadElim/def-store-before-load.ll
+++ llvm/trunk/test/Transforms/LoopLoadElim/def-store-before-load.ll
@@ -0,0 +1,35 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+
+; No loop-carried forwarding: The intervening store to A[i] kills the stored
+; value from the previous iteration.
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i] = 1;
+;     A[i+1] = A[i] + B[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK-NOT: %store_forwarded
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 1, i32* %arrayidx, align 4
+  %a = load i32, i32* %arrayidx, align 4
+  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %b = load i32, i32* %arrayidxB, align 4
+; CHECK: %add = add i32 %b, %a
+  %add = add i32 %b, %a
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopLoadElim/forward.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopLoadElim/forward.ll
+++ llvm/trunk/test/Transforms/LoopLoadElim/forward.ll
@@ -0,0 +1,47 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+
+; Simple st->ld forwarding derived from a lexical forward dep.
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i+1] = B[i] + 2;
+;     C[i] = A[i] * 2;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32* %C, i64 %N) {
+
+; CHECK: for.body.lver.memcheck:
+; CHECK:     %found.conflict{{.*}} =
+; CHECK-NOT: %found.conflict{{.*}} =
+
+entry:
+; for.body.ph:
+; CHECK: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %for.body.ph ], [ %a_p1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+  %a = load i32, i32* %Aidx, align 4
+; CHECK: %c = mul i32 %store_forwarded, 2
+  %c = mul i32 %a, 2
+  store i32 %c, i32* %Cidx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopLoadElim/memcheck.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopLoadElim/memcheck.ll
+++ llvm/trunk/test/Transforms/LoopLoadElim/memcheck.ll
@@ -0,0 +1,52 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+; RUN: opt -loop-load-elim -S -runtime-check-per-loop-load-elim=2 < %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+; This needs two pairs of memchecks (A * { C, D }) for a single load
+; elimination which is considered too expensive by default.
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i+1] = B[i] + 2;
+;     C[i] = A[i] * 2;
+;     D[i] = 2;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32* %C, i64 %N, i32* %D) {
+entry:
+  br label %for.body
+
+; AGGRESSIVE: for.body.lver.memcheck:
+; AGGRESSIVE:     %found.conflict{{.*}} =
+; AGGRESSIVE:     %found.conflict{{.*}} =
+; AGGRESSIVE-NOT: %found.conflict{{.*}} =
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK-NOT: %store_forwarded =
+; AGGRESSIVE: %store_forwarded =
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %Didx = getelementptr inbounds i32, i32* %D, i64 %indvars.iv
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+  %a = load i32, i32* %Aidx, align 4
+; CHECK: %c = mul i32 %a, 2
+; AGGRESSIVE: %c = mul i32 %store_forwarded, 2
+  %c = mul i32 %a, 2
+  store i32 %c, i32* %Cidx, align 4
+  store i32 2, i32* %Didx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll
+++ llvm/trunk/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll
@@ -0,0 +1,48 @@
+; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
+
+; In this case the later store forwards to the load:
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     B[i] = A[i] + 1;
+;     A[i+1] = C[i] + 2;
+;     A[i+1] = D[i] + 3;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B,
+               i32* noalias nocapture %C, i32* noalias nocapture readonly %D,
+               i64 %N) {
+entry:
+; CHECK: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %entry ], [ %addD, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %loadA = load i32, i32* %arrayidxA, align 4
+; CHECK: %addA = add i32 %store_forwarded, 1
+  %addA = add i32 %loadA, 1
+
+  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %addA, i32* %arrayidxB, align 4
+
+  %arrayidxC = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %loadC = load i32, i32* %arrayidxC, align 4
+  %addC = add i32 %loadC, 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidxA_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %addC, i32* %arrayidxA_next, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %D, i64 %indvars.iv
+  %loadD = load i32, i32* %arrayidxD, align 4
+  %addD = add i32 %loadD, 3
+  store i32 %addD, i32* %arrayidxA_next, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopLoadElim/unknown-dep.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopLoadElim/unknown-dep.ll
+++ llvm/trunk/test/Transforms/LoopLoadElim/unknown-dep.ll
@@ -0,0 +1,54 @@
+; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Give up in the presence of unknown deps. Here, the different strides result
+; in unknown dependence:
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i+1] = B[i] + 2;
+;     A[2*i] = C[i] + 2;
+;     D[i] = A[i] + 2;
+;   }
+
+define void @f(i32* noalias %A, i32* noalias %B, i32* noalias %C,
+               i32* noalias %D, i64 %N) {
+
+entry:
+; for.body.ph:
+; CHECK-NOT: %load_initial =
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK-NOT: %store_forwarded =
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Didx = getelementptr inbounds i32, i32* %D, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %indvars.m2 = mul nuw nsw i64 %indvars.iv, 2
+  %A2idx = getelementptr inbounds i32, i32* %A, i64 %indvars.m2
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+  %c = load i32, i32* %Cidx, align 4
+  %a_m2 = add i32 %c, 2
+  store i32 %a_m2, i32* %A2idx, align 4
+
+  %a = load i32, i32* %Aidx, align 4
+; CHECK-NOT: %d = add i32 %store_forwarded, 2
+; CHECK: %d = add i32 %a, 2
+  %d = add i32 %a, 2
+  store i32 %d, i32* %Didx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
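For completeness, the reason the A[2*i] store defeats the analysis in the test above is
that the distance between the conflicting accesses is itself a function of i, so there
is no constant dependence distance to classify. A quick standalone illustration of the
varying element distance (plain arithmetic, not derived from the pass):

  #include <cstdio>

  int main() {
    // Element distance between the store to A[2*i] and the load of A[i]: it
    // grows with i instead of being a fixed constant, so the dependence cannot
    // be classified as a forward or backward dependence with distance one.
    for (long i = 0; i < 4; ++i)
      std::printf("i=%ld  distance(&A[2*i], &A[i]) = %ld elements\n", i, 2 * i - i);
    return 0;
  }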