Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -216,6 +216,7 @@
 void initializeLoopUnrollPass(PassRegistry&);
 void initializeLoopUnswitchPass(PassRegistry&);
 void initializeLoopVectorizePass(PassRegistry&);
+void initializeLoopVectorizePredPass(PassRegistry&);
 void initializeLoopVersioningLICMPass(PassRegistry&);
 void initializeLoopVersioningPassPass(PassRegistry&);
 void initializeLowerAtomicLegacyPassPass(PassRegistry&);
Index: include/llvm/Transforms/Vectorize.h
===================================================================
--- include/llvm/Transforms/Vectorize.h
+++ include/llvm/Transforms/Vectorize.h
@@ -122,6 +122,13 @@
 //===----------------------------------------------------------------------===//
 //
+// LoopVectorizePred - Create a pass to be run before the loop vectorizer.
+//
+Pass *createLoopVectorizePredPass(bool NoUnrolling = false,
+                                  bool AlwaysVectorize = true);
+
+//===----------------------------------------------------------------------===//
+//
 // SLPVectorizer - Create a bottom-up SLP vectorizer pass.
 //
 Pass *createSLPVectorizerPass();
Index: include/llvm/Transforms/Vectorize/LoopVectorizePred.h
===================================================================
--- /dev/null
+++ include/llvm/Transforms/Vectorize/LoopVectorizePred.h
@@ -0,0 +1,87 @@
+//===- LoopVectorizePred.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is intended to be run before the loop vectorizer. It checks
+// whether there is a backward dependence between instructions, and if so, it
+// tries to reorder the instructions so as to convert a non-vectorizable loop
+// into a vectorizable form. The pass uses LoopAccessAnalysis and MemorySSA to
+// find the dependences and to decide whether the reordering is legal.
+//
+// For example, the pass reorders the code below to make it vectorizable:
+//
+// Before applying the pass
+// for (int i = 0; i < n; i++) {
+//   a[i] = b[i];
+//   c[i] = a[i + 1];
+// }
+//
+// After applying the pass
+// for (int i = 0; i < n; i++) {
+//   c[i] = a[i + 1];
+//   a[i] = b[i];
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZEPRED_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZEPRED_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include <functional>
+
+namespace llvm {
+
+/// The LoopVectorizePred Pass.
+struct LoopVectorizePredPass : public PassInfoMixin<LoopVectorizePredPass> {
+  bool DisableUnrolling = false;
+  /// If true, consider all loops for vectorization.
+  /// If false, only loops that explicitly request vectorization are
+  /// considered.
+  bool AlwaysVectorize = true;
+
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  TargetTransformInfo *TTI;
+  DominatorTree *DT;
+
+  TargetLibraryInfo *TLI;
+  AliasAnalysis *AA;
+  AssumptionCache *AC;
+  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
+  MemorySSA *MSSA;
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  // Shim for the old pass manager.
+  bool runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_,
+               TargetTransformInfo &TTI_, DominatorTree &DT_,
+               TargetLibraryInfo *TLI_, AliasAnalysis &AA_,
+               AssumptionCache &AC_,
+               std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+               MemorySSA &MSSA_);
+
+  bool processLoop(Loop *L);
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZEPRED_H
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -142,6 +142,7 @@
 #include "llvm/Transforms/Utils/SimplifyInstructions.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizePred.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include <type_traits>
@@ -644,6 +645,9 @@
   // Now run the core loop vectorizer.
   OptimizePM.addPass(LoopVectorizePass());
 
+  // Run the LoopVectorizePred pass, which reorders backward dependences.
+  OptimizePM.addPass(LoopVectorizePredPass());
+
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
   OptimizePM.addPass(LoopLoadEliminationPass());
Index: lib/Passes/PassRegistry.def
===================================================================
--- lib/Passes/PassRegistry.def
+++ lib/Passes/PassRegistry.def
@@ -174,6 +174,7 @@
 FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
 FUNCTION_PASS("loop-distribute", LoopDistributePass())
 FUNCTION_PASS("loop-vectorize", LoopVectorizePass())
+FUNCTION_PASS("loop-vectorize-pred", LoopVectorizePredPass())
 FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
 FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
 FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -159,6 +159,10 @@
     "enable-gvn-sink", cl::init(false), cl::Hidden,
     cl::desc("Enable the GVN sinking pass (default = off)"));
 
+static cl::opt<bool> EnableLoopVectorizePred(
+    "enable-loopvectorizepred", cl::init(false), cl::Hidden,
+    cl::desc("Enable the new, experimental LoopVectorizePred pass"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
@@ -605,6 +609,9 @@
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   MPM.add(createLoopDistributePass());
 
+  if (EnableLoopVectorizePred)
+    MPM.add(createLoopVectorizePredPass(DisableUnrollLoops));
+
   MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
 
   // Eliminate loads by forwarding stores from the previous iteration to loads
@@ -821,6 +828,10 @@
   if (!DisableUnrollLoops)
     PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
+
+  if (EnableLoopVectorizePred)
+    PM.add(createLoopVectorizePredPass(true));
+
   PM.add(createLoopVectorizePass(true, LoopVectorize));
 
   // The vectorizer may have significantly shortened a loop body; unroll again.
   if (!DisableUnrollLoops)
Index: lib/Transforms/Vectorize/CMakeLists.txt
===================================================================
--- lib/Transforms/Vectorize/CMakeLists.txt
+++ lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,6 +2,7 @@
   BBVectorize.cpp
   LoadStoreVectorizer.cpp
   LoopVectorize.cpp
+  LoopVectorizePred.cpp
   SLPVectorizer.cpp
   Vectorize.cpp
Index: lib/Transforms/Vectorize/LoopVectorizePred.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Vectorize/LoopVectorizePred.cpp
@@ -0,0 +1,457 @@
+//===- LoopVectorizePred.cpp - A pass run before the loop vectorizer -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is intended to be run before the loop vectorizer. It checks
+// whether there is a backward dependence between instructions, and if so, it
+// tries to reorder the instructions so as to convert a non-vectorizable loop
+// into a vectorizable form. The pass uses LoopAccessAnalysis and MemorySSA to
+// find the dependences and to decide whether the reordering is legal.
+//
+// For example, the pass reorders the code below to make it vectorizable:
+//
+// Before applying the pass
+// for (int i = 0; i < n; i++) {
+//   a[i] = b[i];
+//   c[i] = a[i + 1];
+// }
+//
+// After applying the pass
+// for (int i = 0; i < n; i++) {
+//   c[i] = a[i + 1];
+//   a[i] = b[i];
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopVectorizePred.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/OrderedInstructions.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <queue>
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize-pred"
+#define DEBUG_TYPE LV_NAME
+
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed");
+
+namespace {
+
+// Forward declarations.
+class LoopInstructionsReordering;
+
+/// LoopInstructionsReordering checks whether there is a backward dependence
+/// between instructions such that the instructions can be reordered to
+/// convert a non-vectorizable loop into a vectorizable form. It first finds
+/// the store instructions that alias with a load instruction, and then uses
+/// MemorySSA to check whether it is legal to move the load instruction above
+/// the store instructions. If it is legal, it then collects all the
+/// dependences of the load instruction, which also need to be moved above
+/// the store instruction.
+class LoopInstructionsReordering {
+public:
+  LoopInstructionsReordering(
+      Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
+      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
+      const TargetTransformInfo *TTI,
+      std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
+      MemorySSA *MSSA)
+      : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), MSSA(MSSA),
+        GetLAA(GetLAA), LAI(nullptr) {}
+
+  /// Returns true if the instructions have been reordered.
+  bool canbeReordered();
+
+  const LoopAccessInfo *getLAI() const { return LAI; }
+
+private:
+  /// Gets the backward-dependent store and load instructions and checks
+  /// whether it is legal to reorder them, using the MemorySSA results.
+  bool reorderInstructions();
+
+  /// Shifts a load instruction and all its dependences above the
+  /// corresponding backward-dependent store instruction.
+  bool checkDepAndReorder(StoreInst *StInst, Instruction *LdInst);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
+  /// Applies dynamic knowledge to simplify SCEV expressions in the context
+  /// of existing SCEV assumptions. The analysis will also add a minimal set
+  /// of new predicates if this is required to enable vectorization and
+  /// unrolling.
+  PredicatedScalarEvolution &PSE;
+  /// Target Library Info.
+  TargetLibraryInfo *TLI;
+  /// Target Transform Info.
+  const TargetTransformInfo *TTI;
+  /// Dominator Tree.
+  DominatorTree *DT;
+  /// MemorySSA.
+  MemorySSA *MSSA;
+  // LoopAccess analysis.
+  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
+  // And the loop-accesses info corresponding to this loop. This pointer is
+  // null until canbeReordered() sets it up.
+  const LoopAccessInfo *LAI;
+};
+
+/// Moves a load instruction and all its dependences above the corresponding
+/// backward-dependent store instruction.
+bool LoopInstructionsReordering::checkDepAndReorder(StoreInst *StInst,
+                                                    Instruction *LdInst) {
+  std::queue<Instruction *> InstQueue;
+  SmallVector<Instruction *, 8> InstStack;
+  OrderedInstructions OI(DT);
+  InstQueue.push(LdInst);
+
+  Instruction *CurrInst;
+  while (!InstQueue.empty()) {
+    CurrInst = InstQueue.front();
+    InstQueue.pop();
+    // Collect all the instructions to be moved before the store instruction.
+    InstStack.push_back(CurrInst);
+
+    // Visit all operands of the current instruction; any operand that does
+    // not already dominate the store instruction has to be moved as well, so
+    // it is enqueued.
+    for (Value *Op : CurrInst->operands()) {
+      if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+        if (!OI.dominates(OpInst, StInst))
+          InstQueue.push(OpInst);
+    }
+  }
+
+  // Move the load instruction and all its operands before the store
+  // instruction.
+  while (!InstStack.empty()) {
+    CurrInst = InstStack.pop_back_val();
+    CurrInst->moveBefore(StInst);
+  }
+
+  return true;
+}
+
+/// Gets the backward-dependent store and load instructions and checks
+/// whether it is legal to reorder them, using the MemorySSA results.
+bool LoopInstructionsReordering::reorderInstructions() {
+  bool Reordered = false;
+  OrderedInstructions OI(DT);
+
+  auto *Deps = LAI->getDepChecker().getDependences();
+  for (auto Dep : *Deps) {
+    if (Dep.Type != MemoryDepChecker::Dependence::DepType::Backward)
+      continue;
+
+    StoreInst *St = dyn_cast<StoreInst>(Dep.getSource(*LAI));
+    LoadInst *Ld = dyn_cast<LoadInst>(Dep.getDestination(*LAI));
+    // Only handle a backward dependence between a load and a store, not one
+    // between two loads or two stores.
+    if (!Ld || !St)
+      continue;
+
+    MemoryAccess *MemAcc = MSSA->getMemoryAccess(Ld);
+    MemoryUseOrDef *NewMemAcc = cast<MemoryUseOrDef>(MemAcc);
+    MemoryAccess *D = NewMemAcc->getDefiningAccess();
+    if (!D)
+      return Reordered;
+
+    // If the definition of the load is live on entry, the load can be moved
+    // above the store.
+    if (MSSA->isLiveOnEntryDef(D)) {
+      Reordered |= checkDepAndReorder(St, Ld);
+    } else if (MemoryUseOrDef *DefAcc = dyn_cast<MemoryUseOrDef>(D)) {
+      Instruction *DefInst = DefAcc->getMemoryInst();
+      // If the defining access is not contained in the loop, the load can be
+      // moved above the store. Otherwise the load comes after its defining
+      // access, so the instructions can be reordered only if the store lies
+      // between the defining access and the load.
+      if (!TheLoop->contains(DefInst) ||
+          (!OI.dominates(St, DefInst) && OI.dominates(St, Ld)))
+        Reordered |= checkDepAndReorder(St, Ld);
+    } else if (isa<MemoryPhi>(D)) {
+      // The defining access is a MemoryPhi at the top of the loop header, so
+      // the load comes after the phi; if the store lies between the phi and
+      // the load, the instructions can be reordered.
+      if (OI.dominates(St, Ld))
+        Reordered |= checkDepAndReorder(St, Ld);
+    }
+  }
+  return Reordered;
+}
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+  std::string Result;
+  if (L) {
+    raw_string_ostream OS(Result);
+    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+      LoopDbgLoc.print(OS);
+    else
+      // Just print the module name.
+      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+    OS.flush();
+  }
+  return Result;
+}
+#endif
+
+/// Returns true if the given loop body has a cycle, excluding the loop
+/// itself.
+static bool hasCyclesInLoopBody(const Loop &L) {
+  if (!L.empty())
+    return true;
+
+  for (const auto &SCC :
+       make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
+                  scc_iterator<Loop, LoopBodyTraits>::end(L))) {
+    if (SCC.size() > 1) {
+      DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
+      DEBUG(L.dump());
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Detect innermost loops which do not contain cycles.
+static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+  if (L.empty()) {
+    if (!hasCyclesInLoopBody(L))
+      V.push_back(&L);
+    return;
+  }
+  for (Loop *InnerL : L)
+    addAcyclicInnerLoop(*InnerL, V);
+}
+
+/// The LoopVectorizePred legacy pass.
+struct LoopVectorizePred : public FunctionPass {
+  /// Pass identification, replacement for typeid.
+  static char ID;
+
+  explicit LoopVectorizePred(bool NoUnrolling = false,
+                             bool AlwaysVectorize = true)
+      : FunctionPass(ID) {
+    Impl.DisableUnrolling = NoUnrolling;
+    Impl.AlwaysVectorize = AlwaysVectorize;
+    initializeLoopVectorizePredPass(*PassRegistry::getPassRegistry());
+  }
+
+  LoopVectorizePredPass Impl;
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+    auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+
+    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, TLI, *AA, *AC, GetLAA, *MSSA);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<LoopAccessLegacyAnalysis>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<BasicAAWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addRequired<MemorySSAWrapperPass>();
+  }
+};
+} // namespace
+
+char LoopVectorizePred::ID = 0;
+static const char lv_name[] = "Loop Vectorization Predecessor";
+INITIALIZE_PASS_BEGIN(LoopVectorizePred, LV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopVectorizePred, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+Pass *createLoopVectorizePredPass(bool NoUnrolling, bool AlwaysVectorize) {
+  return new LoopVectorizePred(NoUnrolling, AlwaysVectorize);
+}
+} // namespace llvm
+
+bool LoopInstructionsReordering::canbeReordered() {
+  LAI = &(*GetLAA)(*TheLoop);
+  auto *Deps = LAI->getDepChecker().getDependences();
+
+  for (auto Dep : *Deps) {
+    if (Dep.Type == MemoryDepChecker::Dependence::DepType::Backward) {
+      bool Reordered = reorderInstructions();
+      if (Reordered)
+        DEBUG(dbgs() << "\nLV: Instructions in loop have been reordered.\n");
+      return Reordered;
+    }
+  }
+
+  return false;
+}
+
+bool LoopVectorizePredPass::processLoop(Loop *L) {
+  assert(L->empty() && "Only process inner loops.");
+
+#ifndef NDEBUG
+  const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+  DEBUG(dbgs() << "\nLV: Checking a loop in \""
+               << L->getHeader()->getParent()->getName() << "\" from "
+               << DebugLocStr << "\n");
+
+  PredicatedScalarEvolution PSE(*SE, *L);
+  Function *F = L->getHeader()->getParent();
+
+  LoopInstructionsReordering LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI,
+                                 MSSA);
+  return LVL.canbeReordered();
+}
+
+bool LoopVectorizePredPass::runImpl(
+    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
+    DominatorTree &DT_, TargetLibraryInfo *TLI_, AliasAnalysis &AA_,
+    AssumptionCache &AC_,
+    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, MemorySSA &MSSA_) {
+
+  SE = &SE_;
+  LI = &LI_;
+  TTI = &TTI_;
+  DT = &DT_;
+  TLI = TLI_;
+  AA = &AA_;
+  AC = &AC_;
+  GetLAA = &GetLAA_;
+  MSSA = &MSSA_;
+
+  bool Changed = false;
+
+  // This pass requires loops to be in simplified form. Since simplification
+  // may add new inner loops, it has to run before the legality checks. This
+  // means running this pass will simplify all loops, regardless of whether
+  // anything ends up being reordered.
+  for (auto &L : *LI)
+    Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
+  // Build up a worklist of inner loops to process. This is necessary as the
+  // act of reordering a loop can create new loops and invalidate iterators
+  // across the loops.
+  SmallVector<Loop *, 8> Worklist;
+
+  for (Loop *L : *LI)
+    addAcyclicInnerLoop(*L, Worklist);
+
+  LoopsAnalyzed += Worklist.size();
+
+  // Now walk the identified inner loops.
+  while (!Worklist.empty()) {
+    Loop *L = Worklist.pop_back_val();
+
+    // For the inner loops we actually process, form LCSSA to simplify the
+    // transform.
+    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+    Changed |= processLoop(L);
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses LoopVectorizePredPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &AA = AM.getResult<AAManager>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+
+  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+      [&](Loop &L) -> const LoopAccessInfo & {
+    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+    return LAM.getResult<LoopAccessAnalysis>(L, AR);
+  };
+  bool Changed = runImpl(F, SE, LI, TTI, DT, &TLI, AA, AC, GetLAA, MSSA);
+  if (!Changed)
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<LoopAnalysis>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<BasicAA>();
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
Index: lib/Transforms/Vectorize/Vectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/Vectorize.cpp
+++ lib/Transforms/Vectorize/Vectorize.cpp
@@ -28,6 +28,7 @@
 void llvm::initializeVectorization(PassRegistry &Registry) {
   initializeBBVectorizePass(Registry);
   initializeLoopVectorizePass(Registry);
+  initializeLoopVectorizePredPass(Registry);
   initializeSLPVectorizerPass(Registry);
   initializeLoadStoreVectorizerPass(Registry);
 }
@@ -44,6 +45,10 @@
   unwrap(PM)->add(createLoopVectorizePass());
 }
 
+void LLVMAddLoopVectorizePredPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopVectorizePredPass());
+}
+
 void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createSLPVectorizerPass());
 }
Index: test/Other/new-pm-defaults.ll
===================================================================
--- test/Other/new-pm-defaults.ll
+++ test/Other/new-pm-defaults.ll
@@ -150,8 +150,10 @@
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
-; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
+; CHECK-O-NEXT: Running pass: LoopVectorizePredPass
+; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
+; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
Index: test/Other/new-pm-thinlto-defaults.ll
===================================================================
--- test/Other/new-pm-thinlto-defaults.ll
+++ test/Other/new-pm-thinlto-defaults.ll
@@ -168,8 +168,10 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
-; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePredPass
+; CHECK-POSTLINK-O-NEXT: Running analysis: MemorySSAAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
Index: test/Transforms/LoopVectorize/loop-with-backward-dependences.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/loop-with-backward-dependences.ll
@@ -0,0 +1,163 @@
+; RUN: opt < %s -S -loop-vectorize-pred -branch-prob -block-freq -scalar-evolution -aa -loop-accesses -demanded-bits -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-vectorize 2>&1 | FileCheck %s
+
+source_filename = "../test2.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; CHECK-LABEL: foo1
+;
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: getelementptr inbounds i32, i32* %b, i64 {{.*}}
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @foo1(i32 %n, i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %m) local_unnamed_addr #0 {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx4, align 4, !tbaa !2
+  %arrayidx6 = getelementptr inbounds i32, i32* %m, i64 %indvars.iv
+  store i32 %1, i32* %arrayidx6, align 4, !tbaa !2
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %arrayidx7 = getelementptr inbounds i32, i32* %m, i64 7
+  %2 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+  ret i32 %2
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; CHECK-LABEL: foo2
+;
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: getelementptr inbounds i32, i32* %b, i64 {{.*}}
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @foo2(i32 %n, i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %m, i32* noalias nocapture readonly %c, i32* noalias nocapture %f, i32* noalias nocapture %d) local_unnamed_addr #0 {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx4, align 4, !tbaa !2
+  %arrayidx6 = getelementptr inbounds i32, i32* %m, i64 %indvars.iv
+  store i32 %1, i32* %arrayidx6, align 4, !tbaa !2
+  %arrayidx8 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx8, align 4, !tbaa !2
+  %arrayidx10 = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx10, align 4, !tbaa !2
+  %arrayidx13 = getelementptr inbounds i32, i32* %f, i64 %indvars.iv.next
+  %3 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %arrayidx15 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  store i32 %3, i32* %arrayidx15, align 4, !tbaa !2
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %arrayidx16 = getelementptr inbounds i32, i32* %m, i64 7
+  %4 = load i32, i32* %arrayidx16, align 4, !tbaa !2
+  ret i32 %4
+}
+
+; CHECK-LABEL: foo3
+;
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: getelementptr inbounds i32, i32* %b, i64 {{.*}}
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @foo3(i32 %n, i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %m, i32* noalias nocapture %c) local_unnamed_addr #0 {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx4, align 4, !tbaa !2
+  %arrayidx6 = getelementptr inbounds i32, i32* %m, i64 %indvars.iv
+  store i32 %1, i32* %arrayidx6, align 4, !tbaa !2
+  %2 = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %a, i64 %2
+  %3 = load i32, i32* %arrayidx9, align 4, !tbaa !2
+  %arrayidx11 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  store i32 %3, i32* %arrayidx11, align 4, !tbaa !2
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %arrayidx12 = getelementptr inbounds i32, i32* %m, i64 7
+  %4 = load i32, i32* %arrayidx12, align 4, !tbaa !2
+  ret i32 %4
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="slm" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind "target-cpu"="slm" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (https://github.com/llvm-mirror/clang.git 6cfb5bf41823be28bca09fe72dd3d4b83f4e1be8) (https://github.com/llvm-mirror/llvm.git 71ff951212dc0f4da08d348cdc6f14997ecc7f96)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
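
A minimal way to exercise the new pass in isolation, assuming the patch applies as-is (input.ll stands for any test input; the pass and flag names below are the ones registered by this patch):

  opt -S -loop-vectorize-pred input.ll             ; legacy pass manager
  opt -S -passes=loop-vectorize-pred input.ll      ; new pass manager
  opt -S -O2 -enable-loopvectorizepred input.ll    ; gated use in the default pipelines

The pass is off by default in both pipelines; the hidden -enable-loopvectorizepred flag turns it on in PassManagerBuilder, and the new-PM default pipeline schedules LoopVectorizePredPass unconditionally, as checked by the updated new-pm-defaults.ll and new-pm-thinlto-defaults.ll tests.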