Index: include/llvm/Analysis/AccessAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Analysis/AccessAnalysis.h @@ -0,0 +1,387 @@ +//===- - -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// + +#ifndef LLVM_ANALYSIS_ACCESSANALYSIS_H +#define LLVM_ANALYSIS_ACCESSANALYSIS_H + +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +/// Optimization analysis message produced during vectorization. Messages inform +/// the user why vectorization did not occur. +class Report { + std::string Message; + raw_string_ostream Out; + Instruction *Instr; + +public: + Report(Instruction *I = nullptr) : Out(Message), Instr(I) { + Out << "loop not vectorized: "; + } + + template Report &operator<<(const A &Value) { + Out << Value; + return *this; + } + + Instruction *getInstr() { return Instr; } + + std::string &str() { return Out.str(); } + operator Twine() { return Out.str(); } +}; + +/// This struct holds information about the memory runtime legality +/// check that a group of pointers do not overlap. +struct RuntimePointerCheck { + RuntimePointerCheck() : Need(false) {} + + /// Reset the state of the pointer runtime information. + void reset() { + Need = false; + Pointers.clear(); + Starts.clear(); + Ends.clear(); + IsWritePtr.clear(); + DependencySetId.clear(); + AliasSetId.clear(); + PartitionSetId.clear(); + } + + void print(); + + /// Insert a pointer and calculate the start and end SCEVs. + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, + unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); + + /// This flag indicates if we need to add the runtime check. + bool Need; + /// Holds the pointers that we need to check. + SmallVector, 2> Pointers; + /// Holds the pointer value at the beginning of the loop. + SmallVector Starts; + /// Holds the pointer value at the end of the loop. + SmallVector Ends; + /// Holds the information if this pointer is used for writing to memory. + SmallVector IsWritePtr; + /// Holds the id of the set of pointers that could be dependent because of a + /// shared underlying object. + SmallVector DependencySetId; + /// Holds the id of the disjoint alias set to which this pointer belongs. + SmallVector AliasSetId; + // FIXME: this does not work if we have more than 64 partitions + SmallVector PartitionSetId; +}; + +/// \brief Analyses memory accesses in a loop. +/// +/// Checks whether run time pointer checks are needed and builds sets for data +/// dependence checking. +class AccessAnalysis { +public: + /// \brief Read or write access location. + typedef PointerIntPair MemAccessInfo; + typedef SmallPtrSet MemAccessInfoSet; + + /// \brief Set of potential dependent memory accesses. + typedef EquivalenceClasses DepCandidates; + + AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : + DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} + + /// \brief Register a load and whether it is only read from. + void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { + Value *Ptr = const_cast(Loc.Ptr); + AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); + Accesses.insert(MemAccessInfo(Ptr, false)); + if (IsReadOnly) + ReadOnlyPtr.insert(Ptr); + } + + /// \brief Register a store. + void addStore(AliasAnalysis::Location &Loc) { + Value *Ptr = const_cast(Loc.Ptr); + AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); + Accesses.insert(MemAccessInfo(Ptr, true)); + } + + /// \brief Check whether we can check the pointers at runtime for + /// non-intersection. + bool canCheckPtrAtRT(RuntimePointerCheck &RtCheck, + unsigned &NumComparisons, ScalarEvolution *SE, + Loop *TheLoop, ValueToValueMap &Strides, + bool ShouldCheckStride = false); + + /// \brief Goes over all memory accesses, checks whether a RT check is needed + /// and builds sets of dependent accesses. + void buildDependenceSets() { + processMemAccesses(); + } + + bool isRTCheckNeeded() { return IsRTCheckNeeded; } + + bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } + void resetDepChecks() { CheckDeps.clear(); } + + MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } + +private: + typedef SetVector PtrAccessSet; + + /// \brief Go over all memory access and check whether runtime pointer checks + /// are needed /// and build sets of dependency check candidates. + void processMemAccesses(); + + /// Set of all accesses. + PtrAccessSet Accesses; + + /// Set of accesses that need a further dependence check. + MemAccessInfoSet CheckDeps; + + /// Set of pointers that are read only. + SmallPtrSet ReadOnlyPtr; + + const DataLayout *DL; + + /// An alias set tracker to partition the access set by underlying object and + //intrinsic property (such as TBAA metadata). + AliasSetTracker AST; + + /// Sets of potentially dependent accesses - members of one set share an + /// underlying pointer. The set "CheckDeps" identfies which sets really need a + /// dependence check. + DepCandidates &DepCands; + + bool IsRTCheckNeeded; +}; + +class RuntimeCheckEmitter { +public: + /// \brief Add code that checks at runtime if the accessed arrays overlap. + /// + /// Returns a pair of instructions where the first element is the first + /// instruction generated in possibly a sequence of instructions and the + /// second value is the final comparator value or NULL if no check is needed. + std::pair + addRuntimeCheck(Instruction *Loc, RuntimePointerCheck *PtrRtCheck, + ScalarEvolution *SE, Loop *Loop); +}; + +struct OrderedInstruction { + Instruction *I; + unsigned Order; + OrderedInstruction(Instruction *I, unsigned Order) : I(I), Order(Order) { + } +}; +bool operator<(const OrderedInstruction &LHS, const OrderedInstruction &RHS); + +/// \brief Checks memory dependences among accesses to the same underlying +/// object to determine whether there vectorization is legal or not (and at +/// which vectorization factor). +/// +/// This class works under the assumption that we already checked that memory +/// locations with different underlying pointers are "must-not alias". +/// We use the ScalarEvolution framework to symbolically evalutate access +/// functions pairs. Since we currently don't restructure the loop we can rely +/// on the program order of memory accesses to determine their safety. +/// At the moment we will only deem accesses as safe for: +/// * A negative constant distance assuming program order. +/// +/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; +/// a[i] = tmp; y = a[i]; +/// +/// The latter case is safe because later checks guarantuee that there can't +/// be a cycle through a phi node (that is, we check that "x" and "y" is not +/// the same variable: a header phi can only be an induction or a reduction, a +/// reduction can't have a memory sink, an induction can't have a memory +/// source). This is important and must not be violated (or we have to +/// resort to checking for cycles through memory). +/// +/// * A positive constant distance assuming program order that is bigger +/// than the biggest memory access. +/// +/// tmp = a[i] OR b[i] = x +/// a[i+2] = tmp y = b[i+2]; +/// +/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. +/// +/// * Zero distances and all accesses have the same size. +/// +class MemoryDepChecker { +public: + typedef PointerIntPair MemAccessInfo; + typedef SmallPtrSet MemAccessInfoSet; + typedef std::vector > + DependenceVector; + + MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) + : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), + ShouldRetryWithRuntimeCheck(false) {} + + /// \brief Register the location (instructions are given increasing numbers) + /// of a write access. + void addAccess(StoreInst *SI) { + Value *Ptr = SI->getPointerOperand(); + Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); + InstMap.push_back(SI); + ++AccessIdx; + } + + /// \brief Register the location (instructions are given increasing numbers) + /// of a write access. + void addAccess(LoadInst *LI) { + Value *Ptr = LI->getPointerOperand(); + Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); + InstMap.push_back(LI); + ++AccessIdx; + } + + /// \brief Check whether the dependencies between the accesses are safe. + /// + /// Only checks sets with elements in \p CheckDeps. + bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); + + void findUnsafeDeps(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides, + MemoryDepChecker::DependenceVector &UnsafeDeps); + + /// \brief The maximum number of bytes of a vector register we can vectorize + /// the accesses safely with. + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + + /// \brief In same cases when the dependency check fails we can still + /// vectorize the loop with a dynamic array access check. + bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } + +private: + ScalarEvolution *SE; + const DataLayout *DL; + const Loop *InnermostLoop; +public: + /// \brief Maps access locations (ptr, read/write) to program order. + DenseMap > Accesses; +private: +public: + /// \brief Memory access instructions in program order. + SmallVector InstMap; +private: + + /// \brief The program order index to be used for the next instruction. + unsigned AccessIdx; + + // We can access this many bytes in parallel safely. + unsigned MaxSafeDepDistBytes; + + /// \brief If we see a non-constant dependence distance we can still try to + /// vectorize this loop with runtime checks. + bool ShouldRetryWithRuntimeCheck; + + /// \brief Check whether there is a plausible dependence between the two + /// accesses. + /// + /// Access \p A must happen before \p B in program order. The two indices + /// identify the index into the program order map. + /// + /// This function checks whether there is a plausible dependence (or the + /// absence of such can't be proved) between the two accesses. If there is a + /// plausible dependence but the dependence distance is bigger than one + /// element access it records this distance in \p MaxSafeDepDistBytes (if this + /// distance is smaller than any other distance encountered so far). + /// Otherwise, this function returns true signaling a possible dependence. + bool isDependent(const MemAccessInfo &A, unsigned AIdx, + const MemAccessInfo &B, unsigned BIdx, + ValueToValueMap &Strides); + + /// \brief Check whether the data dependence could prevent store-load + /// forwarding. + bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); +}; + +static inline Instruction *getFirstInst(Instruction *FirstInst, Value *V, + Instruction *Loc) { + if (FirstInst) + return FirstInst; + if (Instruction *I = dyn_cast(V)) + return I->getParent() == Loc->getParent() ? I : nullptr; + return nullptr; +} + +static inline void addInnerLoop(Loop &L, SmallVectorImpl &V) { + if (L.empty()) + return V.push_back(&L); + + for (Loop *InnerL : L) + addInnerLoop(*InnerL, V); +} + +class LoopAccessAnalysis : public RuntimeCheckEmitter { +public: + unsigned NumLoads; + unsigned NumStores; + unsigned NumPredStores; + + LoopAccessAnalysis(Loop *L, const DataLayout *DL, + AliasAnalysis *AA, ScalarEvolution *SCEV) : + AA(AA), + SE(SCEV), + Accesses(DL, AA, DependentAccesses), + DepChecker(SE, DL, L), + TheLoop(L){} + + bool analyzeLoop(); + // FIXME + void emitAnalysis(Report &Message) {} + + bool isUniform(Value *V) { + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + } + + SmallVector + getInstructionsForAccess(AccessAnalysis::MemAccessInfo &A) { + SmallVector Insts; + auto &Ids = DepChecker.Accesses[A]; + std::transform(Ids.begin(), Ids.end(), std::back_inserter(Insts), + [&](unsigned Id) { + return this->DepChecker.InstMap[Id]; + }); + return Insts; + } + + std::pair addRuntimeCheck(Instruction *Loc) { + return RuntimeCheckEmitter::addRuntimeCheck(Loc, &PtrRtCheck, SE, TheLoop); + } + +private: + AliasAnalysis *AA; + ScalarEvolution *SE; + AccessAnalysis Accesses; + AccessAnalysis::DepCandidates DependentAccesses; +public: + MemoryDepChecker DepChecker; +private: + Loop *TheLoop; +public: + RuntimePointerCheck PtrRtCheck; +private: + // FIXME + ValueToValueMap Strides; +public: + MemoryDepChecker::DependenceVector UnsafeDeps; +}; + +} // End llvm namespace + +#endif Index: include/llvm/Analysis/ControlDependence.h =================================================================== --- /dev/null +++ include/llvm/Analysis/ControlDependence.h @@ -0,0 +1,66 @@ +//===- - -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// + +#ifndef LLVM_ANALYSIS_CONTROLDEPENDENCE_H +#define LLVM_ANALYSIS_CONTROLDEPENDENCE_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include + +namespace llvm { + class ControlDependence { + // FIXME: do some caching here. + PostDominatorTree *PDT; + Loop *Region; + + public: + ControlDependence(PostDominatorTree *PDT, Loop *L) : PDT(PDT), Region(L) { + } + + Instruction *dependsOnBlock(BasicBlock *B) { + return nullptr; + } + + static Instruction *findControlInstInBlock(BasicBlock *B) { + return nullptr; + } + + SmallVector findControlBlocks(BasicBlock *B) { + SmallVector DirectCD; + + // FIXME: find non-direct CDs as well.x + std::copy_if(pred_begin(B), pred_end(B), + std::back_inserter(DirectCD), + [this, B](BasicBlock *P) { + return (this->Region->contains(P) && + !this->PDT->dominates(B, P)); + }); + + return DirectCD; + } + + SmallVector findControlInsts(BasicBlock *B) { + SmallVector CI; + + SmallVector CD = findControlBlocks(B); + for (BasicBlock *C : CD) { + auto *Br = cast(C->getTerminator()); + assert(Br->isConditional() && + "Control block with unconditional branch"); + CI.push_back(Br); + } + return CI; + } +}; +} + +#endif Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -164,6 +164,7 @@ void initializeLoaderPassPass(PassRegistry&); void initializeLocalStackSlotPassPass(PassRegistry&); void initializeLoopDeletionPass(PassRegistry&); +void initializeLoopDistributePass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopInfoPass(PassRegistry&); void initializeLoopInstSimplifyPass(PassRegistry&); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -138,6 +138,8 @@ // Pass *createLoopUnswitchPass(bool OptimizeForSize = false); +Pass *createLoopDistributePass(); + //===----------------------------------------------------------------------===// // // LoopInstSimplify - This pass simplifies instructions in a loop's body. Index: lib/Analysis/AccessAnalysis.cpp =================================================================== --- /dev/null +++ lib/Analysis/AccessAnalysis.cpp @@ -0,0 +1,399 @@ +#include "llvm/Analysis/AccessAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "access-analysis" + +using namespace llvm; + +std::pair +RuntimeCheckEmitter::addRuntimeCheck(Instruction *Loc, + RuntimePointerCheck *PtrRtCheck, + ScalarEvolution *SE, + Loop *OrigLoop) { + Instruction *tnullptr = nullptr; + if (!PtrRtCheck->Need) + return std::pair(tnullptr, tnullptr); + + unsigned NumPointers = PtrRtCheck->Pointers.size(); + SmallVector , 2> Starts; + SmallVector , 2> Ends; + + LLVMContext &Ctx = Loc->getContext(); + SCEVExpander Exp(*SE, "induction"); + Instruction *FirstInst = nullptr; + + for (unsigned i = 0; i < NumPointers; ++i) { + Value *Ptr = PtrRtCheck->Pointers[i]; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, OrigLoop)) { + DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << + *Ptr <<"\n"); + Starts.push_back(Ptr); + Ends.push_back(Ptr); + } else { + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + + // Use this type for pointer arithmetic. + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); + + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); + Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); + Starts.push_back(Start); + Ends.push_back(End); + } + } + + unsigned NumChecks = 0; + IRBuilder<> ChkBuilder(Loc); + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = nullptr; + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i+1; j < NumPointers; ++j) { + // No need to check if two readonly pointers intersect. + if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) + continue; + + // Only need to check pointers between two different dependency sets. + if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) + continue; + // Only need to check pointers in the same alias set. + if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) + continue; + // FIXME: for LV make these all unique. + if (!PtrRtCheck->PartitionSetId.empty() && + PtrRtCheck->PartitionSetId[i] == PtrRtCheck->PartitionSetId[j]) + continue; + + ++NumChecks; + unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); + unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); + + assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && + (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + FirstInst = getFirstInst(FirstInst, Cmp0, Loc); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + FirstInst = getFirstInst(FirstInst, Cmp1, Loc); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + FirstInst = getFirstInst(FirstInst, IsConflict, Loc); + if (MemoryRuntimeCheck) { + IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, + "conflict.rdx"); + FirstInst = getFirstInst(FirstInst, IsConflict, Loc); + } + MemoryRuntimeCheck = IsConflict; + } + } + + dbgs() << "Number of checks: " << NumChecks << " for " << NumPointers + << " pointers\n"; + + // We have to do this trickery because the IRBuilder might fold the check to a + // constant expression in which case there is no Instruction anchored in a + // the block. + Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, + ConstantInt::getTrue(Ctx)); + ChkBuilder.Insert(Check, "memcheck.conflict"); + FirstInst = getFirstInst(FirstInst, Check, Loc); + return std::make_pair(FirstInst, Check); +} + +bool LoopAccessAnalysis::analyzeLoop() { + + typedef SmallVector ValueVector; + typedef SmallPtrSet ValueSet; + + // Holds the Load and Store *instructions*. + ValueVector Loads; + ValueVector Stores; + + // Holds all the different accesses in the loop. + unsigned NumReads = 0; + unsigned NumReadWrites = 0; + + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; + + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. + if (it->mayReadFromMemory()) { +#ifdef FIXME + // Many math library functions read the rounding mode. We will only + // vectorize a loop if it contains known function calls that don't set + // the flag. Therefore, it is safe to ignore this read from memory. + CallInst *Call = dyn_cast(it); + if (Call && getIntrinsicIDForCall(Call, TLI)) + continue; +#endif + LoadInst *Ld = dyn_cast(it); + if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { + emitAnalysis(Report(Ld) + << "read with atomic ordering or volatile read"); + DEBUG(dbgs() << "LV: Found a non-simple load.\n"); + return false; + } + NumLoads++; + Loads.push_back(Ld); + DepChecker.addAccess(Ld); + continue; + } + + // Save 'store' instructions. Abort if other instructions write to memory. + if (it->mayWriteToMemory()) { + StoreInst *St = dyn_cast(it); + if (!St) { + emitAnalysis(Report(it) << "instruction cannot be vectorized"); + return false; + } + if (!St->isSimple() && !IsAnnotatedParallel) { + emitAnalysis(Report(St) + << "write with atomic ordering or volatile write"); + DEBUG(dbgs() << "LV: Found a non-simple store.\n"); + return false; + } + NumStores++; + Stores.push_back(St); + DepChecker.addAccess(St); + } + } // Next instr. + } // Next block. + + // Now we have two lists that hold the loads and the stores. + // Next, we find the pointers that they use. + + // Check if we see any stores. If there are no stores, then we don't + // care if the pointers are *restrict*. + if (!Stores.size()) { + DEBUG(dbgs() << "LV: Found a read-only loop!\n"); + return true; + } + + // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects + // multiple times on the same object. If the ptr is accessed twice, once + // for read and once for write, it will only appear once (on the write + // list). This is okay, since we are going to check for conflicts between + // writes and between reads and writes, but not between reads and reads. + ValueSet Seen; + + ValueVector::iterator I, IE; + for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { + StoreInst *ST = cast(*I); + Value* Ptr = ST->getPointerOperand(); + + if (isUniform(Ptr)) { + emitAnalysis( + Report(ST) + << "write to a loop invariant address could not be vectorized"); + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + + // If we did *not* see this pointer before, insert it to the read-write + // list. At this phase it is only a 'write' list. + if (Seen.insert(Ptr)) { + ++NumReadWrites; + + AliasAnalysis::Location Loc = AA->getLocation(ST); +#ifdef FIXME + // The TBAA metadata could have a control dependency on the predication + // condition, so we cannot rely on it when determining whether or not we + // need runtime pointer checks. + if (blockNeedsPredication(ST->getParent())) + Loc.AATags.TBAA = nullptr; +#endif + + Accesses.addStore(Loc); + } + } + + if (IsAnnotatedParallel) { + DEBUG(dbgs() + << "LV: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); + return true; + } + + for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { + LoadInst *LD = cast(*I); + Value* Ptr = LD->getPointerOperand(); + // If we did *not* see this pointer before, insert it to the + // read list. If we *did* see it before, then it is already in + // the read-write list. This allows us to vectorize expressions + // such as A[i] += x; Because the address of A[i] is a read-write + // pointer. This only works if the index of A[i] is consecutive. + // If the address of i is unknown (for example A[B[i]]) then we may + // read a few words, modify, and write a few words, and some of the + // words may be written to the same address. + bool IsReadOnlyPtr = false; + if (Seen.insert(Ptr) +#ifdef FIXME + || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides) +#endif + ) { + ++NumReads; + IsReadOnlyPtr = true; + } + + AliasAnalysis::Location Loc = AA->getLocation(LD); +#ifdef FIXME + // The TBAA metadata could have a control dependency on the predication + // condition, so we cannot rely on it when determining whether or not we + // need runtime pointer checks. + if (blockNeedsPredication(LD->getParent())) + Loc.AATags.TBAA = nullptr; +#endif + + Accesses.addLoad(Loc, IsReadOnlyPtr); + } + + // If we write (or read-write) to a single destination and there are no + // other reads in this loop then is it safe to vectorize. + if (NumReadWrites == 1 && NumReads == 0) { + DEBUG(dbgs() << "LV: Found a write-only loop!\n"); + return true; + } + + // Build dependence sets and check whether we need a runtime pointer bounds + // check. + Accesses.buildDependenceSets(); + bool NeedRTCheck = Accesses.isRTCheckNeeded(); + + // Find pointers with computable bounds. We are going to use this information + // to place a runtime bound check. + unsigned NumComparisons = 0; + bool CanDoRT = false; + if (NeedRTCheck) + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, + Strides); + + DEBUG(dbgs() << "LV: We need to do " << NumComparisons << + " pointer comparisons.\n"); + + // If we only have one set of dependences to check pointers among we don't + // need a runtime check. + if (NumComparisons == 0 && NeedRTCheck) + NeedRTCheck = false; + +#ifdef FIXME + // Check that we did not collect too many pointers or found an unsizeable + // pointer. + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { + PtrRtCheck.reset(); + CanDoRT = false; + } +#endif + if (CanDoRT) { + DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); + } + + if (NeedRTCheck && !CanDoRT) { + emitAnalysis(Report() << "cannot identify array bounds"); + DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << + "the array bounds.\n"); + PtrRtCheck.reset(); + return false; + } + + PtrRtCheck.Need = NeedRTCheck; + + if (Accesses.isDependencyCheckNeeded()) { + DEBUG(dbgs() << "LV: Checking memory dependencies\n"); + DepChecker.findUnsafeDeps(DependentAccesses, + Accesses.getDependenciesToCheck(), + Strides, UnsafeDeps); + } + + DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << + " need a runtime memory check.\n"); + + return true; +} + +void MemoryDepChecker::findUnsafeDeps(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps, + ValueToValueMap &Strides, + DependenceVector &UnsafeDeps) { + MaxSafeDepDistBytes = -1U; + while (!CheckDeps.empty()) { + MemAccessInfo CurAccess = *CheckDeps.begin(); + + // Get the relevant memory access set. + EquivalenceClasses::iterator I = + AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); + + // Check accesses within this set. + EquivalenceClasses::member_iterator AI, AE; + AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); + + // Check every access pair. + while (AI != AE) { + CheckDeps.erase(*AI); + EquivalenceClasses::member_iterator OI = std::next(AI); + while (OI != AE) { + // Check every accessing instruction pair in program order. + for (std::vector::iterator I1 = Accesses[*AI].begin(), + I1E = Accesses[*AI].end(); I1 != I1E; ++I1) + for (std::vector::iterator I2 = Accesses[*OI].begin(), + I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { + if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) + UnsafeDeps.push_back( + std::make_pair(OrderedInstruction(InstMap[*I1], *I1), + OrderedInstruction(InstMap[*I2], *I2))); + if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) + UnsafeDeps.push_back( + std::make_pair(OrderedInstruction(InstMap[*I2], *I2), + OrderedInstruction(InstMap[*I1], *I1))); + } + ++OI; + } + AI++; + } + } +} + +bool +MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps, + ValueToValueMap &Strides) { + DependenceVector UnsafeDeps; + findUnsafeDeps(AccessSets, CheckDeps, Strides, UnsafeDeps); + return UnsafeDeps.size() == 0; +} + +void RuntimePointerCheck::print() { + for (unsigned I = 0; I < Pointers.size(); ++I) + dbgs() << *Pointers[I] + << " Partition: " << PartitionSetId[I] + << " DependencySet: " << DependencySetId[I] + << " Write: " << IsWritePtr[I] + << "\n"; +} +namespace llvm { +bool operator<(const OrderedInstruction &LHS, const OrderedInstruction &RHS) { + return LHS.Order < RHS.Order; +} +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -298,6 +298,10 @@ if (ExtraVectorizerPasses) MPM.add(createLoopRotatePass()); + // FIXME: ensure we have loop preheaders in ldist + MPM.add(createLoopSimplifyPass()); + MPM.add(createLoopDistributePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // FIXME: Because of #pragma vectorize enable, the passes below are always // inserted in the pipeline, even when the vectorizer doesn't run (ex. when Index: lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/LoopDistribute.cpp @@ -0,0 +1,532 @@ +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/iterator_range.h" +#include +#include +#include "llvm/Analysis/AccessAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ControlDependence.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Support/Debug.h" +#include "llvm/Pass.h" + +#define LDIST_NAME "loop-distribute" +#define DEBUG_TYPE LDIST_NAME + +using namespace llvm; + +namespace { + +void remapInstructions(SmallVectorImpl &Blocks, + ValueToValueMapTy &VMap) { + // Rewrite the code to refer to itself. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) + for (BasicBlock::iterator I = Blocks[i]->begin(), + E = Blocks[i]->end(); I != E; ++I) + RemapInstruction(I, VMap, + RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); +} + +class InstrPartition { + typedef std::set InstructionSet; + + InstructionSet Set; + Loop *L; + ControlDependence *CD; + +public: + typedef SmallVector BlocksT; + BlocksT Blocks; + ValueToValueMapTy VMap; + bool hasCycle; + + template + InstrPartition(IterTy B, IterTy E, bool hasCycle, Loop *L, + ControlDependence *CD) : L(L), CD(CD), hasCycle(hasCycle) { + for (IterTy I = B; I != E; ++I) { + Set.insert((*I).I); + } + } + + InstructionSet::iterator begin() { + return Set.begin(); + } + + InstructionSet::iterator end() { + return Set.end(); + } + + void populateUsedSet() { + SmallVector Worklist(Set.begin(), Set.end()); + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + BasicBlock *BB = I->getParent(); + + SmallVector CIs = CD->findControlInsts(BB); + // Insert in the set and any new elements onto the worklist. + for (Instruction *I : CIs) + if (Set.insert(I).second) + Worklist.push_back(I); + + // Insert instructions from the loop that we depend on as well. + for (Value *V : I->operand_values()) { + auto *I = dyn_cast(V); + if (I && L->contains(I->getParent()) && Set.insert(I).second) + Worklist.push_back(I); + } + } + } + + void print() { + for (auto *I : Set) + dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; + if (hasCycle) + dbgs() << " (cycle)\n"; + } + + BasicBlock *getLoopPreheader() { + return Blocks[0]; + } + + void remapInstructions() { + ::remapInstructions(Blocks, VMap); + } + + iterator_range getLoopBlocks() { + return make_range(Blocks.begin() + 1, Blocks.end() - 1); + } + + void removeUnusedInsts() { + SmallVector Unused; + for (auto *BB: L->getBlocks()) + for (auto &I: *BB) + if (!Set.count(&I)) { + Instruction *NewI = &I; + if (!VMap.empty()) + NewI = cast(VMap[NewI]); + auto *Br = dyn_cast(NewI); + if (Br) { + // If this is a conditional branch that no-one is control dependent + // on then we can go either way. Turn it into an unconditional + // branch. Keep unconditional branches. + if (Br->isConditional()) { + BranchInst::Create(Br->getSuccessor(0), Br); + Unused.push_back(NewI); + } + } else + Unused.push_back(NewI); + } + for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) + (*I)->eraseFromParent(); + + dbgs() << "\nAfter removing unused Instrs:\n"; + if (Blocks.empty()) + for (auto *BB: L->getBlocks()) + dbgs() << *BB; + else + for (auto *BB: Blocks) + dbgs() << *BB; + } +}; + +// When splitting A->B, SplitEdge produces A->B->B.split and returns the new +// block B.split, I think that in some other cases it returns the middle block. +// To keep this consistent when transforming A->B to A->M->B we return M and +// update B if necessary. +BasicBlock *LDistSplitEdge(BasicBlock *From, BasicBlock *&To, Pass *P) { + BasicBlock *NewBlock = SplitEdge(From, To, P); + BasicBlock *SinglePred = NewBlock->getSinglePredecessor(); + if (SinglePred && SinglePred != From) { + To = NewBlock; + return SinglePred; + } + return NewBlock; +} + +SmallVector splitLoopPreheadAndExit(Loop *L, Pass *P) { + BasicBlock *PH = L->getLoopPreheader(); + BasicBlock *H = L->getHeader(); + BasicBlock *NewPH = LDistSplitEdge(PH, H, P); + BasicBlock *ExitBlock = L->getExitBlock(); + BasicBlock *NewExit = LDistSplitEdge(L->getExitingBlock(), ExitBlock, P); + SmallVector LoopBlocks; + LoopBlocks.push_back(NewPH); + LoopBlocks.append(L->block_begin(), L->block_end()); + LoopBlocks.push_back(NewExit); + return LoopBlocks; +} + +void cloneLoop(BasicBlock *Before, + SmallVectorImpl &LoopBlocks, + SmallVectorImpl &NewBlocks, + ValueToValueMapTy &VMap, const Twine &NameSuffix) { + Function *F = LoopBlocks[0]->getParent(); + for (BasicBlock *BB : LoopBlocks) { + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + NewBlocks.push_back(NewBB); + VMap[BB] = NewBB; +// FIXME: +// LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); + } + // Move them physically from the end of the block list. + F->getBasicBlockList().splice(Before, F->getBasicBlockList(), + NewBlocks[0], F->end()); +} + +// +// +// +class InstrPartitionSet { + + SmallVector, 8> Vector; + EquivalenceClasses EC; + Loop *L; + PostDominatorTree *PDT; + ControlDependence CD; + DenseMap InstToPartitionId; + SmallSet ClassWithCycle; + +public: + InstrPartitionSet(Loop *L, PostDominatorTree *PDT) : + L(L), PDT(PDT), CD(PDT, L) { + } + + size_t size() { + return EC.getNumClasses(); + } + + void seedTogether(OrderedInstruction A, OrderedInstruction B) { + auto I = EC.unionSets(A, B); + ClassWithCycle.insert(&*I); + } + + void mergeClasses(const OrderedInstruction &A, const OrderedInstruction &B) { + auto I = EC.unionSets(A, B); + assert(&A == &*I); + if (ClassWithCycle.count(&B)) + ClassWithCycle.insert(&*I); + } + + void seed(OrderedInstruction I) { + EC.insert(I); + } + + bool isConditional(Instruction *I) { + return !PDT->dominates(I->getParent(), L->getHeader()); + } + + void mergeSingleStores() { + // FIXME: merge single stores. We should be able to vectorize these in one + // loop. (Multiple instructions in a class means that there is unsafe + // dependence among them, so we won't be able to vectorize that loop.) + for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { + auto &P = *I; + if (P.isLeader() && P.getNext() == nullptr) { + auto Range = make_range(std::next(I), EC.end()); + for (auto &R: Range) + if (R.isLeader() && R.getNext() == nullptr && + R.getData().I->getParent() == P.getData().I->getParent()) + // FIXME: Doesn't this invalidate the iterator? + mergeClasses(P.getData(), R.getData()); + } + } + } + + void mergeConditionalStores() { + // If all stores are conditional in a partition, we won't be able to + // vectorize it so merge it into previous partition. + const OrderedInstruction *PrevClass = nullptr; + for (auto I = EC.begin(), IE = EC.end(); I != IE; ++I) + if (I->isLeader()) { + if (PrevClass) { + auto J = EC.member_begin(I), JE = EC.member_end(); + bool seenStore = false; + for (; J != JE; ++J) { + if (isa((*J).I)) + seenStore = true; + if (seenStore && !isConditional((*J).I)) + break; + } + if (J == JE) + mergeClasses(I->getData(), *PrevClass); + } + PrevClass = &I->getData(); + } + } + + void merge() { + mergeSingleStores(); + dbgs() << "\nMerged partitions (1):\n"; + printEC(); + + mergeConditionalStores(); + dbgs() << "\nMerged partitions(2):\n"; + printEC(); + } + + void createPartitions() { + for (auto I = EC.begin(), E = EC.end(); I != E; ++I) + if (I->isLeader()) + Vector.push_back(make_unique(EC.member_begin(I), + EC.member_end(), + ClassWithCycle.count(&I->getData()), + L, &CD)); + } + + void populateUsedSet() { + for (auto &P : Vector) + P->populateUsedSet(); + + for (unsigned P = 0; P < Vector.size(); ++P) + for (Instruction *I : *Vector[P]) + InstToPartitionId[I] = P; + } + + void setSuccessor(BasicBlock *BB, BasicBlock *Succ) { + auto Branch = cast(BB->getTerminator()); + assert(Branch->isUnconditional()); + Branch->setSuccessor(0, Succ); + } + + void cloneLoops(Pass *P) { + BasicBlock *ExitBlock = L->getExitBlock(); + BasicBlock *PH = L->getLoopPreheader(); + SmallVector LoopBlocks = splitLoopPreheadAndExit(L, P); + BasicBlock *NewPH = LoopBlocks[0]; + + // In order to stich in the new loop on the PH -> NewPH edge: + // * change the NewExit -> ExitBlock to NewExit -> NewPH + // * change PH -> NewPH to PH -> last cloned NewPH + for (int I = Vector.size() - 2; I >= 0; --I) { + auto &Part = Vector[I]; + + cloneLoop(NewPH, LoopBlocks, Part->Blocks, Part->VMap, + Twine(".ldist") + Twine(I + 1)); + Part->VMap[ExitBlock] = NewPH; + NewPH = Part->getLoopPreheader(); + Part->remapInstructions(); + } + setSuccessor(PH, NewPH); + } + + void removeUnusedInsts() { + for (unsigned I = 0; I < Vector.size(); ++I) { + auto &Part = Vector[I]; + Part->removeUnusedInsts(); + } + } + + void setPartitionPtrUse(LoopAccessAnalysis &LAA) { + RuntimePointerCheck &RtPtrCheck = LAA.PtrRtCheck; + unsigned N = RtPtrCheck.Pointers.size(); + assert(RtPtrCheck.PartitionSetId.empty() && "PartitionSetId not empty"); + RtPtrCheck.PartitionSetId.reserve(N); + for (unsigned I = 0; I < N; ++I) { + Value *Ptr = RtPtrCheck.Pointers[I]; + AccessAnalysis::MemAccessInfo Access(Ptr, RtPtrCheck.IsWritePtr[I]); + auto Instructions = LAA.getInstructionsForAccess(Access); + uint64_t P = + std::accumulate(Instructions.begin(), Instructions.end(), + (uint64_t) 0, + [&](uint64_t res, Instruction *Inst) { + return res | (1 << this->InstToPartitionId[Inst]); + }); + RtPtrCheck.PartitionSetId.push_back(P); + } + } + + void printEC() { + int i = 0; + for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { + if (!I->isLeader()) continue; + dbgs() << i++; + if (ClassWithCycle.count(&I->getData())) + dbgs() << " (cycle)"; + dbgs() << ":\n"; + for (auto MI = EC.member_begin(I); MI != EC.member_end(); ++MI) + dbgs() << " " << (*MI).Order << " " << *(*MI).I << "\n"; + } + } + + void print() { + for (auto I = Vector.begin(); I != Vector.end(); ++I) { + dbgs() << I - Vector.begin() << ":\n"; + (*I)->print(); + } + } +}; + +class LDistRuntimeCheckEmitter { + LoopAccessAnalysis &LAA; + Loop *L; + BasicBlock *MemCheckBB; + SmallVector OrigLoopBlocks; + +public: + LDistRuntimeCheckEmitter(LoopAccessAnalysis &LAA, Loop *L) : + LAA(LAA), L(L), MemCheckBB(nullptr) { + } + + bool needsRuntimeChecks() const { + // FIXME + return true; + } + + void partitionPointers(InstrPartitionSet &Partitions) { + // Set up partition id in PtrRtChecks. Ptr -> Access -> Intruction -> + // Partition. + Partitions.setPartitionPtrUse(LAA); + dbgs() << "\nPointers:\n"; + LAA.PtrRtCheck.print(); + } + + void versionLoop(Pass *P) { + BasicBlock *OrigPH = L->getLoopPreheader(); + SmallVector LoopBlocks = splitLoopPreheadAndExit(L, P); + ValueToValueMapTy VMap; + cloneLoop(*(LoopBlocks.end() - 1), LoopBlocks, OrigLoopBlocks, VMap, + ".ldist.orig"); + remapInstructions(OrigLoopBlocks, VMap); + + MemCheckBB = SplitBlock(OrigPH, OrigPH->getTerminator(), P); + MemCheckBB->setName(L->getHeader()->getName() + ".ldist.memcheck"); + + } + + void insertChecks() { + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + Instruction *OrigTerm = MemCheckBB->getTerminator(); + std::tie(FirstCheckInst, MemRuntimeCheck) = LAA.addRuntimeCheck(OrigTerm); + BranchInst::Create(OrigLoopBlocks[0], L->getLoopPreheader(), + MemRuntimeCheck, OrigTerm); + OrigTerm->eraseFromParent(); + } +}; + +struct LoopDistribute : public FunctionPass { + static char ID; + typedef EquivalenceClasses PartitionsT; + + LoopDistribute() : FunctionPass(ID) { + initializeLoopDistributePass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + const DataLayout *DL; + LoopInfo *LI; + AliasAnalysis *AA; + PostDominatorTree *PDT; + + virtual bool runOnFunction(Function &F) { + SE = &getAnalysis(); + DataLayoutPass *DLP = getAnalysisIfAvailable(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + LI = &getAnalysis(); + AA = &getAnalysis(); + PDT = &getAnalysis(); + + // Build up a worklist of inner-loops to vectorize. This is necessary as + // the act of vectorizing or partially unrolling a loop creates new loops + // and can invalidate iterators across the loops. + SmallVector Worklist; + + for (Loop *L : *LI) + addInnerLoop(*L, Worklist); + + // Now walk the identified inner loops. + bool Changed = false; + while (!Worklist.empty()) + Changed |= processLoop(Worklist.pop_back_val()); + + // Process each loop nest in the function. + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool processLoop(Loop *L) { + assert(L->empty() && "Only process inner loops."); + + DEBUG(dbgs() << "\nLV: In \"" + << L->getHeader()->getParent()->getName() << + "\" checking " << *L << "\n"); + + // ScalarEvolution needs to be able to find the exit count. + const SCEV *ExitCount = SE->getBackedgeTakenCount(L); + if (ExitCount == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); + return false; + } + + LoopAccessAnalysis LAA(L, DL, AA, SE); + if (!LAA.analyzeLoop()) + return false; + + InstrPartitionSet Partitions(L, PDT); + + // FIXME: This is not sufficient. We need to put all mem accesses between + // these two into the same partition as well. + for (auto &d : LAA.UnsafeDeps) { + dbgs() << *d.first.I + << "\n->\n" << *d.second.I << "\n\n"; + + Partitions.seedTogether(d.first, d.second); + } + + // FIXME: also add defs whose value is used outside the loop + for (auto B = LAA.DepChecker.InstMap.begin(), + E = LAA.DepChecker.InstMap.end(), I = B; I != E; ++I) + if (isa(*I)) + Partitions.seed(OrderedInstruction(*I, std::distance(B, I))); + + dbgs() << "Seeded partitions:\n"; + Partitions.printEC(); + + Partitions.merge(); + + if (Partitions.size() <= 1) + return false; + + Partitions.createPartitions(); + Partitions.populateUsedSet(); + dbgs() << "\nPopulated partitions:\n"; + Partitions.print(); + + LDistRuntimeCheckEmitter RtCheckEmitter(LAA, L); + if (RtCheckEmitter.needsRuntimeChecks()) { + RtCheckEmitter.partitionPointers(Partitions); + RtCheckEmitter.versionLoop(this); + RtCheckEmitter.insertChecks(); + } + + Partitions.cloneLoops(this); + + Partitions.removeUnusedInsts(); + + return true; + } +}; +} + +char LoopDistribute::ID = 0; +static const char ldist_name[] = "Loop Distribition"; + +INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) + +namespace llvm { + Pass *createLoopDistributePass() { + return new LoopDistribute(); + } +} Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -44,6 +44,7 @@ initializeJumpThreadingPass(Registry); initializeLICMPass(Registry); initializeLoopDeletionPass(Registry); + initializeLoopDistributePass(Registry); initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -66,6 +66,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/AccessAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -218,29 +219,6 @@ class LoopVectorizationCostModel; class LoopVectorizeHints; -/// Optimization analysis message produced during vectorization. Messages inform -/// the user why vectorization did not occur. -class Report { - std::string Message; - raw_string_ostream Out; - Instruction *Instr; - -public: - Report(Instruction *I = nullptr) : Out(Message), Instr(I) { - Out << "loop not vectorized: "; - } - - template Report &operator<<(const A &Value) { - Out << Value; - return *this; - } - - Instruction *getInstr() { return Instr; } - - std::string &str() { return Out.str(); } - operator Twine() { return Out.str(); } -}; - /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -255,7 +233,7 @@ /// aspects. The InnerLoopVectorizer relies on the /// LoopVectorizationLegality class to provide information about the induction /// and reduction variables that were found to a given vectorization factor. -class InnerLoopVectorizer { +class InnerLoopVectorizer : public RuntimeCheckEmitter { public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, const DataLayout *DL, @@ -293,13 +271,6 @@ typedef DenseMap, VectorParts> EdgeMaskCache; - /// \brief Add code that checks at runtime if the accessed arrays overlap. - /// - /// Returns a pair of instructions where the first element is the first - /// instruction generated in possibly a sequence of instructions and the - /// second value is the final comparator value or NULL if no check is needed. - std::pair addRuntimeCheck(Instruction *Loc); - /// \brief Add checks for strides that where assumed to be 1. /// /// Returns the last check instruction and the first check instruction in the @@ -657,43 +628,6 @@ MinMaxReductionKind MinMaxKind; }; - /// This struct holds information about the memory runtime legality - /// check that a group of pointers do not overlap. - struct RuntimePointerCheck { - RuntimePointerCheck() : Need(false) {} - - /// Reset the state of the pointer runtime information. - void reset() { - Need = false; - Pointers.clear(); - Starts.clear(); - Ends.clear(); - IsWritePtr.clear(); - DependencySetId.clear(); - AliasSetId.clear(); - } - - /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, - unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); - - /// This flag indicates if we need to add the runtime check. - bool Need; - /// Holds the pointers that we need to check. - SmallVector, 2> Pointers; - /// Holds the pointer value at the beginning of the loop. - SmallVector Starts; - /// Holds the pointer value at the end of the loop. - SmallVector Ends; - /// Holds the information if this pointer is used for writing to memory. - SmallVector IsWritePtr; - /// Holds the id of the set of pointers that could be dependent because of a - /// shared underlying object. - SmallVector DependencySetId; - /// Holds the id of the disjoint alias set to which this pointer belongs. - SmallVector AliasSetId; - }; - /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} @@ -1220,14 +1154,6 @@ } } -static void addInnerLoop(Loop &L, SmallVectorImpl &V) { - if (L.empty()) - return V.push_back(&L); - - for (Loop *InnerL : L) - addInnerLoop(*InnerL, V); -} - /// The LoopVectorize Pass. struct LoopVectorize : public FunctionPass { /// Pass identification, replacement for typeid @@ -1536,7 +1462,7 @@ return SE->getSCEV(Ptr); } -void LoopVectorizationLegality::RuntimePointerCheck::insert( +void RuntimePointerCheck::insert( ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides) { // Get the stride replaced scev. @@ -1998,15 +1924,6 @@ } } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast(V)) - return I->getParent() == Loc->getParent() ? I : nullptr; - return nullptr; -} - std::pair InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { Instruction *tnullptr = nullptr; @@ -2044,102 +1961,6 @@ return std::make_pair(FirstInst, TheCheck); } -std::pair -InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { - LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); - - Instruction *tnullptr = nullptr; - if (!PtrRtCheck->Need) - return std::pair(tnullptr, tnullptr); - - unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector , 2> Starts; - SmallVector , 2> Ends; - - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = nullptr; - - for (unsigned i = 0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck->Pointers[i]; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << - *Ptr <<"\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); - Starts.push_back(Start); - Ends.push_back(End); - } - } - - IRBuilder<> ChkBuilder(Loc); - // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - // No need to check if two readonly pointers intersect. - if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) - continue; - - // Only need to check pointers between two different dependency sets. - if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. - if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; - } - } - - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(Check, "memcheck.conflict"); - FirstInst = getFirstInst(FirstInst, Check, Loc); - return std::make_pair(FirstInst, Check); -} - void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain @@ -2364,7 +2185,8 @@ // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeCheck(LastBypassBlock->getTerminator()); + addRuntimeCheck(LastBypassBlock->getTerminator(), + Legal->getRuntimePointerCheck(), SE, OrigLoop); if (MemRuntimeCheck) { // Create a new block containing the memory check. BasicBlock *CheckBlock = @@ -3989,91 +3811,6 @@ } } -namespace { -/// \brief Analyses memory accesses in a loop. -/// -/// Checks whether run time pointer checks are needed and builds sets for data -/// dependence checking. -class AccessAnalysis { -public: - /// \brief Read or write access location. - typedef PointerIntPair MemAccessInfo; - typedef SmallPtrSet MemAccessInfoSet; - - /// \brief Set of potential dependent memory accesses. - typedef EquivalenceClasses DepCandidates; - - AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : - DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} - - /// \brief Register a load and whether it is only read from. - void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { - Value *Ptr = const_cast(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, false)); - if (IsReadOnly) - ReadOnlyPtr.insert(Ptr); - } - - /// \brief Register a store. - void addStore(AliasAnalysis::Location &Loc) { - Value *Ptr = const_cast(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, true)); - } - - /// \brief Check whether we can check the pointers at runtime for - /// non-intersection. - bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, ValueToValueMap &Strides, - bool ShouldCheckStride = false); - - /// \brief Goes over all memory accesses, checks whether a RT check is needed - /// and builds sets of dependent accesses. - void buildDependenceSets() { - processMemAccesses(); - } - - bool isRTCheckNeeded() { return IsRTCheckNeeded; } - - bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } - void resetDepChecks() { CheckDeps.clear(); } - - MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } - -private: - typedef SetVector PtrAccessSet; - - /// \brief Go over all memory access and check whether runtime pointer checks - /// are needed /// and build sets of dependency check candidates. - void processMemAccesses(); - - /// Set of all accesses. - PtrAccessSet Accesses; - - /// Set of accesses that need a further dependence check. - MemAccessInfoSet CheckDeps; - - /// Set of pointers that are read only. - SmallPtrSet ReadOnlyPtr; - - const DataLayout *DL; - - /// An alias set tracker to partition the access set by underlying object and - //intrinsic property (such as TBAA metadata). - AliasSetTracker AST; - - /// Sets of potentially dependent accesses - members of one set share an - /// underlying pointer. The set "CheckDeps" identfies which sets really need a - /// dependence check. - DepCandidates &DepCands; - - bool IsRTCheckNeeded; -}; - -} // end anonymous namespace - /// \brief Check whether a pointer can participate in a runtime bounds check. static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, Value *Ptr) { @@ -4091,7 +3828,7 @@ const Loop *Lp, ValueToValueMap &StridesMap); bool AccessAnalysis::canCheckPtrAtRT( - LoopVectorizationLegality::RuntimePointerCheck &RtCheck, + RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, ValueToValueMap &StridesMap, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information @@ -4261,6 +3998,7 @@ // there is no other write to the ptr - this is an optimization to // catch "a[i] = a[i] + " without having to do a dependence check). if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { + DEBUG(dbgs() << "LV: CheckDeps this ptr " << *Access.getPointer() << "\n"); CheckDeps.insert(Access); IsRTCheckNeeded = true; } @@ -4279,6 +4017,7 @@ if (Prev != ObjToLastAccess.end()) DepCands.unionSets(Access, Prev->second); + DEBUG(dbgs() << "LV: UnderlyingPtr " << *UnderlyingObj << "\n"); ObjToLastAccess[UnderlyingObj] = Access; } } @@ -4286,124 +4025,6 @@ } } -namespace { -/// \brief Checks memory dependences among accesses to the same underlying -/// object to determine whether there vectorization is legal or not (and at -/// which vectorization factor). -/// -/// This class works under the assumption that we already checked that memory -/// locations with different underlying pointers are "must-not alias". -/// We use the ScalarEvolution framework to symbolically evalutate access -/// functions pairs. Since we currently don't restructure the loop we can rely -/// on the program order of memory accesses to determine their safety. -/// At the moment we will only deem accesses as safe for: -/// * A negative constant distance assuming program order. -/// -/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; -/// a[i] = tmp; y = a[i]; -/// -/// The latter case is safe because later checks guarantuee that there can't -/// be a cycle through a phi node (that is, we check that "x" and "y" is not -/// the same variable: a header phi can only be an induction or a reduction, a -/// reduction can't have a memory sink, an induction can't have a memory -/// source). This is important and must not be violated (or we have to -/// resort to checking for cycles through memory). -/// -/// * A positive constant distance assuming program order that is bigger -/// than the biggest memory access. -/// -/// tmp = a[i] OR b[i] = x -/// a[i+2] = tmp y = b[i+2]; -/// -/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. -/// -/// * Zero distances and all accesses have the same size. -/// -class MemoryDepChecker { -public: - typedef PointerIntPair MemAccessInfo; - typedef SmallPtrSet MemAccessInfoSet; - - MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) - : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), - ShouldRetryWithRuntimeCheck(false) {} - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(StoreInst *SI) { - Value *Ptr = SI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); - InstMap.push_back(SI); - ++AccessIdx; - } - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(LoadInst *LI) { - Value *Ptr = LI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); - InstMap.push_back(LI); - ++AccessIdx; - } - - /// \brief Check whether the dependencies between the accesses are safe. - /// - /// Only checks sets with elements in \p CheckDeps. - bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); - - /// \brief The maximum number of bytes of a vector register we can vectorize - /// the accesses safely with. - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } - - /// \brief In same cases when the dependency check fails we can still - /// vectorize the loop with a dynamic array access check. - bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } - -private: - ScalarEvolution *SE; - const DataLayout *DL; - const Loop *InnermostLoop; - - /// \brief Maps access locations (ptr, read/write) to program order. - DenseMap > Accesses; - - /// \brief Memory access instructions in program order. - SmallVector InstMap; - - /// \brief The program order index to be used for the next instruction. - unsigned AccessIdx; - - // We can access this many bytes in parallel safely. - unsigned MaxSafeDepDistBytes; - - /// \brief If we see a non-constant dependence distance we can still try to - /// vectorize this loop with runtime checks. - bool ShouldRetryWithRuntimeCheck; - - /// \brief Check whether there is a plausible dependence between the two - /// accesses. - /// - /// Access \p A must happen before \p B in program order. The two indices - /// identify the index into the program order map. - /// - /// This function checks whether there is a plausible dependence (or the - /// absence of such can't be proved) between the two accesses. If there is a - /// plausible dependence but the dependence distance is bigger than one - /// element access it records this distance in \p MaxSafeDepDistBytes (if this - /// distance is smaller than any other distance encountered so far). - /// Otherwise, this function returns true signaling a possible dependence. - bool isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides); - - /// \brief Check whether the data dependence could prevent store-load - /// forwarding. - bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); -}; - -} // end anonymous namespace - static bool isInBoundsGep(Value *Ptr) { if (GetElementPtrInst *GEP = dyn_cast(Ptr)) return GEP->isInBounds(); @@ -4493,6 +4114,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize) { + // FIXME + return false; // If loads occur at a distance that is not a multiple of a feasible vector // factor store-load forwarding does not take place. // Positive dependences might cause troubles because vectorizing them might @@ -4656,45 +4279,6 @@ return false; } -bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, - ValueToValueMap &Strides) { - - MaxSafeDepDistBytes = -1U; - while (!CheckDeps.empty()) { - MemAccessInfo CurAccess = *CheckDeps.begin(); - - // Get the relevant memory access set. - EquivalenceClasses::iterator I = - AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); - - // Check accesses within this set. - EquivalenceClasses::member_iterator AI, AE; - AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); - - // Check every access pair. - while (AI != AE) { - CheckDeps.erase(*AI); - EquivalenceClasses::member_iterator OI = std::next(AI); - while (OI != AE) { - // Check every accessing instruction pair in program order. - for (std::vector::iterator I1 = Accesses[*AI].begin(), - I1E = Accesses[*AI].end(); I1 != I1E; ++I1) - for (std::vector::iterator I2 = Accesses[*OI].begin(), - I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { - if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) - return false; - if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) - return false; - } - ++OI; - } - AI++; - } - } - return true; -} - bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector ValueVector;