Index: include/llvm/Transforms/Vectorize/LoopVectorize.h =================================================================== --- include/llvm/Transforms/Vectorize/LoopVectorize.h +++ include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -26,6 +26,14 @@ // of vectorization. It decides on the optimal vector width, which // can be one, if vectorization is not profitable. // +// There is an ongoing development effort to migrate the loop vectorizer to +// the VPlan infrastructure and to introduce outer loop vectorization support +// (see docs/Proposal/VectorizationPlan.rst and +// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this +// purpose, we temporarily introduced the VPlan-native vectorization path: an +// alternative vectorization path that is natively implemented on top of the +// VPlan infrastructure. See EnableVPlanNativePath for enabling. +// //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: Index: lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -144,6 +144,10 @@ /// Plan how to best vectorize, return the best VF and its cost. VectorizationFactor plan(bool OptForSize, unsigned UserVF); + /// Use the VPlan-native path to plan how to best vectorize, return the best + /// VF and its cost. + VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -26,6 +26,14 @@ // of vectorization. It decides on the optimal vector width, which // can be one, if vectorization is not profitable. // +// There is an ongoing development effort to migrate the loop vectorizer to +// the VPlan infrastructure and to introduce outer loop vectorization support +// (see docs/Proposal/VectorizationPlan.rst and +// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this +// purpose, we temporarily introduced the VPlan-native vectorization path: an +// alternative vectorization path that is natively implemented on top of the +// VPlan infrastructure. See EnableVPlanNativePath for enabling. +// //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: @@ -251,6 +259,11 @@ cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +static cl::opt<bool> EnableVPlanNativePath( + "enable-vplan-native-path", cl::init(false), cl::Hidden, + cl::desc("Enable VPlan-native vectorization path with " + "support for outer loop vectorization.")); + /// Create an analysis remark that explains why vectorization failed /// /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).
\p @@ -1519,7 +1532,7 @@ std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC) - : TheLoop(L), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA), + : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {} /// ReductionList contains the reduction descriptors for all @@ -1621,6 +1634,15 @@ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } private: + /// Return true if the pre-header, exiting and latch blocks of \p Lp and all + /// its nested loops are considered legal for vectorization. These legal + /// checks are common for inner and outer loop vectorization. + bool canVectorizeLoopNestCFG(Loop *Lp); + + /// Return true if the pre-header, exiting and latch blocks of \p Lp + /// (non-recursive) are considered legal for vectorization. + bool canVectorizeLoopCFG(Loop *Lp); + /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count /// and we only need to check individual instructions. @@ -1636,6 +1658,10 @@ /// transformation. bool canVectorizeWithIfConvert(); + /// Return true if we can vectorize this outer loop. The method performs + /// specific checks for outer loop vectorization. + bool canVectorizeOuterLoop(); + /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1672,6 +1698,9 @@ /// The loop that we evaluate. Loop *TheLoop; + /// Loop Info analysis. + LoopInfo *LI; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. /// Applies dynamic knowledge to simplify SCEV expressions in the context /// of existing SCEV assumptions. The analysis will also add a minimal set @@ -2275,17 +2304,73 @@ } // end anonymous namespace -static void addAcyclicInnerLoop(Loop &L, LoopInfo &LI, - SmallVectorImpl<Loop *> &V) { - if (L.empty()) { +// Return true if \p OuterLp is an outer loop annotated with hints for explicit +// vectorization. The loop needs to be annotated with #pragma omp simd +// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the +// vector length information is not provided, vectorization is not considered +// explicit. Interleave hints are not allowed either. These limitations will be +// relaxed in the future. +// Please note that we are currently forced to abuse the pragma 'clang +// vectorize' semantics. This pragma provides *auto-vectorization hints* +// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' +// provides *explicit vectorization hints* (LV can bypass legal checks and +// assume that vectorization is legal). However, both hints are implemented +// using the same metadata (llvm.loop.vectorize, processed by +// LoopVectorizeHints). This will be fixed in the future when the native IR +// representation for pragma 'omp simd' is introduced. +static bool isExplicitVecOuterLoop(Loop *OuterLp, + OptimizationRemarkEmitter *ORE) { + assert(!OuterLp->empty() && "This is not an outer loop"); + LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); + + // Only outer loops with an explicit vectorization hint are supported. + // Unannotated outer loops are ignored.
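+ // For illustration only (this mirrors the metadata used by the tests added + // below; block labels are hypothetical), such a loop reaches this point with + // an annotated backedge like: + //   br i1 %exitcond, label %exit, label %outer.body, !llvm.loop !0 + //   !0 = distinct !{!0, !1, !2} + //   !1 = !{!"llvm.loop.vectorize.width", i32 4} + //   !2 = !{!"llvm.loop.vectorize.enable", i1 true} + // which LoopVectorizeHints reports as getForce() == FK_Enabled and + // getWidth() == 4.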
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) + return false; + + Function *Fn = OuterLp->getHeader()->getParent(); + if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) { + DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); + return false; + } + + if (!Hints.getWidth()) { + DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n"); + emitMissedWarning(Fn, OuterLp, Hints, ORE); + return false; + } + + if (Hints.getInterleave() > 1) { + // TODO: Interleave support is future work. + DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " + "outer loops.\n"); + emitMissedWarning(Fn, OuterLp, Hints, ORE); + return false; + } + + return true; +} + +static void collectSupportedLoops(Loop &L, LoopInfo *LI, + OptimizationRemarkEmitter *ORE, + SmallVectorImpl<Loop *> &V) { + // Collect inner loops and outer loops without irreducible control flow. For + // now, only collect outer loops that have explicit vectorization hints. + if (L.empty() || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); - RPOT.perform(&LI); - if (!containsIrreducibleCFG(RPOT, LI)) + RPOT.perform(LI); + if (!containsIrreducibleCFG(RPOT, *LI)) { V.push_back(&L); - return; + // TODO: Collect inner loops inside marked outer loops in case + // vectorization fails for the outer loop. Do not invoke + // 'containsIrreducibleCFG' again for inner loops when the outer loop is + // already known to be reducible. We can use an inherited attribute for + // that. + return; + } } for (Loop *InnerL : L) - addAcyclicInnerLoop(*InnerL, LI, V); + collectSupportedLoops(*InnerL, LI, ORE, V); } namespace { @@ -4832,15 +4917,24 @@ return true; } -bool LoopVectorizationLegality::canVectorize() { +// Helper function to canVectorizeLoopNestCFG. +bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp) { + assert((EnableVPlanNativePath || Lp->empty()) && + "VPlan-native path is not enabled."); + + // TODO: ORE should be improved to show more accurate information when an + // outer loop can't be vectorized because a nested loop is not understood or + // legal. Something like: "outer_loop_location: loop not vectorized: + // (inner_loop_location) loop control flow is not understood by vectorizer". + // Store the result and return it at the end instead of exiting early, in case // allowExtraAnalysis is used to report multiple reasons for not vectorizing. bool Result = true; - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. - if (!TheLoop->getLoopPreheader()) { + if (!Lp->getLoopPreheader()) { DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n"); ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); @@ -4850,21 +4944,8 @@ return false; } - // FIXME: The code is currently dead, since the loop gets sent to - // LoopVectorizationLegality is already an innermost loop. - // - // We can only vectorize innermost loops. - if (!TheLoop->empty()) { - ORE->emit(createMissedAnalysis("NotInnermostLoop") - << "loop is not the innermost loop"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - // We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1) { + if (Lp->getNumBackEdges() != 1) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); if (DoExtraAnalysis) Result = false; else return false; } // We must have a single exiting block. - if (!TheLoop->getExitingBlock()) { + if (!Lp->getExitingBlock()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); if (DoExtraAnalysis) Result = false; else return false; } // We only handle bottom-tested loops, i.e. loops in which the condition is // checked at the end of each iteration. With that we can assume that all // instructions in the loop are executed the same number of times. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + if (Lp->getExitingBlock() != Lp->getLoopLatch()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); if (DoExtraAnalysis) Result = false; else return false; } + return Result; +} + +bool LoopVectorizationLegality::canVectorizeLoopNestCFG(Loop *Lp) { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + if (!canVectorizeLoopCFG(Lp)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Recursively check whether the loop control flow of nested loops is + // understood. + for (Loop *SubLp : *Lp) + if (!canVectorizeLoopNestCFG(SubLp)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +bool LoopVectorizationLegality::canVectorize() { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + // Check whether the loop-related control flow in the loop nest is understood + // by the vectorizer. + if (!canVectorizeLoopNestCFG(TheLoop)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + // We need to have a loop header. DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'); + // Specific checks for outer loops. We skip the remaining legal checks at this + // point because they don't support outer loops. + if (!TheLoop->empty()) { + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + + if (!canVectorizeOuterLoop()) { + DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n"); + // TODO: Implement DoExtraAnalysis when subsequent legal checks support + // outer loops. + return false; + } + + DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n"); + return Result; + } + + assert(TheLoop->empty() && "Inner loop expected."); // Check if we can if-convert non-single-bb loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { @@ -4955,6 +5096,140 @@ return Result; } +// Return true if the inner loop \p Lp is uniform with regard to the outer loop +// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes +// executing the inner loop will execute the same iterations). This check is +// very constrained for now but it will be relaxed in the future.
\p Lp is +// considered uniform if it meets all the following conditions: +// 1) it has a canonical IV (starting from 0 and with stride 1), +// 2) its latch terminator is a conditional branch, and +// 3) its latch condition is a compare instruction whose operands are the +// canonical IV and an OuterLp invariant. +// This check doesn't take into account the uniformity of other conditions not +// related to the loop latch because they don't affect the loop uniformity. +// +// NOTE: We decided to keep all these checks and their associated documentation +// together so that we can easily have a picture of the current supported loop +// nests. However, some of the current checks don't depend on \p OuterLp and +// would be redundantly executed for each \p Lp if we invoked this function for +// different candidate outer loops. This is not the case for now because we +// don't currently have the infrastructure to evaluate multiple candidate outer +// loops and \p OuterLp will be a fixed parameter while we only support explicit +// outer loop vectorization. It's also very likely that these checks will go +// away before introducing the aforementioned infrastructure. However, if this +// is not the case, we should move the \p OuterLp independent checks to a +// separate function that is only executed once for each \p Lp. +static bool isUniformLoop(Loop *Lp, Loop *OuterLp) { + assert(Lp->getLoopLatch() && "Expected loop with a single latch."); + + // If Lp is the outer loop, it's uniform by definition. + if (Lp == OuterLp) + return true; + assert(OuterLp->contains(Lp) && "OuterLp must contain Lp."); + + // 1. + PHINode *IV = Lp->getCanonicalInductionVariable(); + if (!IV) { + DEBUG(dbgs() << "LV: Canonical IV not found.\n"); + return false; + } + + // 2. + BasicBlock *Latch = Lp->getLoopLatch(); + auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!LatchBr || LatchBr->isUnconditional()) { + DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n"); + return false; + } + + // 3. + auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition()); + if (!LatchCmp) { + DEBUG(dbgs() << "LV: Loop latch condition is not a compare instruction.\n"); + return false; + } + + Value *CondOp0 = LatchCmp->getOperand(0); + Value *CondOp1 = LatchCmp->getOperand(1); + Value *IVUpdate = IV->getIncomingValueForBlock(Latch); + if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) && + !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) { + DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n"); + return false; + } + + return true; +} + +// Return true if \p Lp and all its nested loops are uniform with regard to \p +// OuterLp. +static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { + if (!isUniformLoop(Lp, OuterLp)) + return false; + + // Check if nested loops are uniform. + for (Loop *SubLp : *Lp) + if (!isUniformLoopNest(SubLp, OuterLp)) + return false; + + return true; +} + +bool LoopVectorizationLegality::canVectorizeOuterLoop() { + assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + + for (BasicBlock *BB : TheLoop->blocks()) { + // Check whether the BB terminator is a BranchInst. Any other terminator is + // not supported yet.
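+    // For instance (hypothetical IR), a block of the loop nest terminated by a + // switch would be rejected by the check below: +    //   switch i32 %v, label %default [ i32 0, label %case0 ]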
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator()); + if (!Br) { + DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Check whether the BranchInst is a supported one. Only unconditional + // branches, conditional branches with an outer loop invariant condition or + // backedges are supported. + if (Br && Br->isConditional() && + !TheLoop->isLoopInvariant(Br->getCondition()) && + !LI->isLoopHeader(Br->getSuccessor(0)) && + !LI->isLoopHeader(Br->getSuccessor(1))) { + DEBUG(dbgs() << "LV: Unsupported conditional branch.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + } + + // Check whether inner loops are uniform. At this point, we only support + // simple outer loop scenarios with uniform nested loops. + if (!isUniformLoopNest(TheLoop /*loop nest*/, + TheLoop /*context outer loop*/)) { + DEBUG(dbgs() + << "LV: Not vectorizing: Outer loop contains divergent loops.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -7406,7 +7681,33 @@ } VectorizationFactor +LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, + unsigned UserVF) { + // Width 1 means no vectorize, cost 0 means uncomputed cost. + const VectorizationFactor NoVectorization = {1U, 0U}; + + // Outer loop handling: outer loops may require CFG and instruction-level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + if (!OrigLoop->empty()) { + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + assert(UserVF && "Expected UserVF for outer loop vectorization."); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + buildVPlans(UserVF, UserVF); + + return {UserVF, 0}; + } + + DEBUG(dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " + "VPlan-native path.\n"); + return NoVectorization; +} + +VectorizationFactor LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { + assert(OrigLoop->empty() && "Inner loop expected."); // Width 1 means no vectorize, cost 0 means uncomputed cost. const VectorizationFactor NoVectorization = {1U, 0U}; Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); @@ -7969,6 +8270,19 @@ LoopVectorizationPlanner::VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range, const SmallPtrSetImpl<Value *> &NeedDef) { + // Outer loop handling: outer loops may require CFG and instruction-level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline.
+ if (!OrigLoop->empty()) { + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + + // Create new empty VPlan. + auto Plan = llvm::make_unique<VPlan>(); + return Plan; + } + + assert(OrigLoop->empty() && "Inner loop expected."); EdgeMaskCache.clear(); BlockMaskCache.clear(); DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); @@ -8298,8 +8612,45 @@ State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +// Process the loop in the VPlan-native vectorization path. This path builds +// VPlan upfront in the vectorization pipeline, which allows applying +// VPlan-to-VPlan transformations from the very beginning without modifying the +// input LLVM IR. +static bool processLoopInVPlanNativePath( + Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, + LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) { + + assert(EnableVPlanNativePath && "VPlan-native path is disabled."); + Function *F = L->getHeader()->getParent(); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); + LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + &Hints, IAI); + // Use the planner for outer loop vectorization. + // TODO: CM is not used at this point inside the planner. Turn CM into an + // optional argument if we don't need it in the future. + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + + // Get user vectorization factor. + unsigned UserVF = Hints.getWidth(); + + // Check the function attributes to find out if this function should be + // optimized for size. + bool OptForSize = + Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + + // Plan how to best vectorize, return the best VF and its cost. + LVP.planInVPlanNativePath(OptForSize, UserVF); + + // Return false. We are currently not generating vector code in the + // VPlan-native path. + return false; +} + bool LoopVectorizePass::processLoop(Loop *L) { - assert(L->empty() && "Only process inner loops."); + assert((EnableVPlanNativePath || L->empty()) && + "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG const std::string DebugLocStr = getDebugLocString(L); @@ -8354,6 +8705,16 @@ bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + // Entrance to the VPlan-native vectorization path. Outer loops are processed + // here. They may require CFG and instruction-level transformations before + // even evaluating whether vectorization is profitable. Since we cannot modify + // the incoming IR, we need to build VPlan upfront in the vectorization + // pipeline. + if (!L->empty()) + return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, + ORE, Hints); + + assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. // Prefer constant trip counts over profile data, over upper bound estimate.
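// Note: the VPlan-native path is off by default (EnableVPlanNativePath is // cl::init(false) above). To exercise it, pass the new flag to opt, as the RUN // lines of the tests below do, e.g.: //   opt -loop-vectorize -enable-vplan-native-path -S input.ll // (input.ll is a placeholder name for the module to vectorize).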
@@ -8630,7 +8991,7 @@ SmallVector<Loop *, 8> Worklist; for (Loop *L : *LI) - addAcyclicInnerLoop(*L, *LI, Worklist); + collectSupportedLoops(*L, LI, ORE, Worklist); LoopsAnalyzed += Worklist.size(); Index: test/Transforms/LoopVectorize/explicit_outer_detection.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/explicit_outer_detection.ll @@ -0,0 +1,238 @@ +; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Verify that outer loops annotated only with the expected explicit +; vectorization hints are collected for vectorization instead of inner loops. + +; Root C/C++ source code for all the test cases +; void foo(int *a, int *b, int N, int M) +; { +; int i, j; +; #pragma clang loop vectorize(enable) +; for (i = 0; i < N; i++) { +; for (j = 0; j < M; j++) { +; a[i*M+j] = b[i*M+j] * b[i*M+j]; +; } +; } +; } + +; Case 1: Annotated outer loop WITH vector width information must be collected. + +; CHECK-LABEL: vector_width +; CHECK: LV: Loop hints: force=enabled width=4 unroll=0 +; CHECK: LV: We can vectorize this outer loop! +; CHECK: LV: Using user VF 4. +; CHECK-NOT: LV: Loop hints: force=? +; CHECK-NOT: LV: Found a loop: inner.body + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 2: Annotated outer loop WITHOUT vector width information must not be +; collected. + +; CHECK-LABEL: case2 +; CHECK-NOT: LV: Loop hints: force=enabled +; CHECK-NOT: LV: We can vectorize this outer loop! +; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body + +define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 3: Annotated outer loop WITH vector width and interleave information +; must not be collected. + +; CHECK-LABEL: case3 +; CHECK-NOT: LV: Loop hints: force=enabled +; CHECK-NOT: LV: We can vectorize this outer loop! +; CHECK: LV: Loop hints: force=? +; CHECK: LV: Found a loop: inner.body + +define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 4: Outer loop without any explicit vectorization annotation must not be +; collected.
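+; (The outer-loop backedge of case 4 below carries no !llvm.loop metadata at +; all, so LoopVectorizeHints reports FK_Undefined, isExplicitVecOuterLoop bails +; out early, and only the inner loop is collected.)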
+ +; CHECK-LABEL: case4 +; CHECK-NOT: LV: Loop hints: force=enabled +; CHECK-NOT: LV: We can vectorize this outer loop! +; CHECK: LV: Loop hints: force=? +; CHECK: LV: Found a loop: inner.body + +define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +; Case 1 +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 4} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} +; Case 2 +!9 = distinct !{!9, !8} +; Case 3 +!10 = !{!"llvm.loop.interleave.count", i32 2} +!11 = distinct !{!11, !7, !10, !8} Index: test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll @@ -0,0 +1,177 @@ +; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Verify that LV bails out on explicit vectorization outer loops that contain +; divergent inner loops. + +; Root C/C++ source code for all the test cases +; void foo(int *a, int *b, int N, int M) +; { +; int i, j; +; #pragma clang loop vectorize(enable) vectorize_width(8) +; for (i = 0; i < N; i++) { +; // Tested inner loop. It will be replaced per test. +; for (j = 0; j < M; j++) { +; a[i*M+j] = b[i*M+j] * b[i*M+j]; +; } +; } +; } + +; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start. + +; CHECK-LABEL: iv_start +; CHECK: LV: Not vectorizing: Outer loop contains divergent loops. +; CHECK: LV: Not vectorizing: Unsupported outer loop. 
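+; (In the IR below, the inner-loop phi %indvars.iv35 starts at %indvars.iv38, +; the outer IV, rather than at zero, so getCanonicalInductionVariable finds no +; canonical IV and isUniformLoop rejects the loop nest.)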
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @iv_start(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp33 = icmp sgt i32 %N, 0 + br i1 %cmp33, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count41 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ] + %cmp231 = icmp slt i64 %indvars.iv38, %0 + br i1 %cmp231, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv38, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv35 = phi i64 [ %indvars.iv38, %inner.ph ], [ %indvars.iv.next36, %inner.body ] + %2 = add nsw i64 %indvars.iv35, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond = icmp eq i64 %indvars.iv.next36, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41 + br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + + +; Case 2 (for (j = 0; j < i; j++)): Inner loop with divergent upper-bound. + +; CHECK-LABEL: loop_ub +; CHECK: LV: Not vectorizing: Outer loop contains divergent loops. +; CHECK: LV: Not vectorizing: Unsupported outer loop. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @loop_ub(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %0 = sext i32 %M to i64 + %wide.trip.count41 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ] + %cmp230 = icmp eq i64 %indvars.iv38, 0 + br i1 %cmp230, label %outer.inc, label %inner.ph + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv38, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %indvars.iv38 + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41 + br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 3 (for (j = 0; j < M; j+=i)): Inner loop with divergent step. 
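+; (Here the inner IV %indvars.iv36 is advanced by %indvars.iv39, the outer IV, +; instead of a constant step of 1, so again no canonical IV exists and the +; uniformity check fails.)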
+ +; CHECK-LABEL: iv_step +; CHECK: LV: Not vectorizing: Outer loop contains divergent loops. +; CHECK: LV: Not vectorizing: Unsupported outer loop. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @iv_step(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp33 = icmp sgt i32 %N, 0 + br i1 %cmp33, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp231 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %for.inc14, %outer.ph + %indvars.iv39 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next40, %for.inc14 ] + br i1 %cmp231, label %inner.ph, label %for.inc14 + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv39, %0 + br label %inner.body + +inner.body: ; preds = %inner.ph, %inner.body + %indvars.iv36 = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next37, %inner.body ] + %2 = add nsw i64 %indvars.iv36, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, %indvars.iv39 + %cmp2 = icmp slt i64 %indvars.iv.next37, %0 + br i1 %cmp2, label %inner.body, label %for.inc14 + +for.inc14: ; preds = %inner.body, %outer.body + %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1 + %exitcond = icmp eq i64 %indvars.iv.next40, %wide.trip.count + br i1 %exitcond, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %for.inc14, %entry + ret void +} + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 8} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} Index: test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll @@ -0,0 +1,138 @@ +; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Verify that LV can handle explicit vectorization outer loops with uniform branches +; but bails out on outer loops with divergent branches. + +; Root C/C++ source code for the test cases +; void foo(int *a, int *b, int N, int M) +; { +; int i, j; +; #pragma clang loop vectorize(enable) vectorize_width(8) +; for (i = 0; i < N; i++) { +; // Tested conditional branch. COND will be replaced per test. +; if (COND) +; for (j = 0; j < M; j++) { +; a[i*M+j] = b[i*M+j] * b[i*M+j]; +; } +; } +; } + +; Case 1 (COND => M == N): Outer loop with uniform conditional branch. + +; CHECK-LABEL: uniform_branch +; CHECK: LV: We can vectorize this outer loop! 
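+; (%brmerge below is computed in outer.ph from %M and %N only, so it is +; invariant in the outer loop and the conditional branch on it passes the +; isLoopInvariant(Br->getCondition()) check in canVectorizeOuterLoop.)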
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp39 = icmp sgt i32 %N, 0 + br i1 %cmp39, label %outer.ph, label %for.end19 + +outer.ph: ; preds = %entry + %cmp337 = icmp slt i32 %M, 1 + %0 = sext i32 %M to i64 + %N64 = zext i32 %N to i64 + %M64 = zext i32 %M to i64 + %cmp1 = icmp ne i32 %M, %N ; Uniform condition + %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ] + %1 = mul nsw i64 %indvars.iv42, %0 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1 + %2 = load i32, i32* %arrayidx, align 4, !tbaa !2 + br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch + +inner.ph: ; preds = %outer.body + br label %inner.body + +inner.body: ; preds = %inner.ph, %inner.body + %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ] + %3 = add nsw i64 %indvars.iv, %1 + %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3 + %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2 + %mul12 = mul nsw i32 %4, %4 + %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3 + store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %M64 + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 + %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64 + br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6 + +for.end19: ; preds = %outer.inc, %entry + ret void +} + + +; Case 2 (COND => B[i * M] == 0): Outer loop with divergent conditional branch. + +; CHECK-LABEL: divergent_branch +; CHECK: Unsupported conditional branch. +; CHECK: LV: Not vectorizing: Unsupported outer loop. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp39 = icmp sgt i32 %N, 0 + br i1 %cmp39, label %outer.ph, label %for.end19 + +outer.ph: ; preds = %entry + %cmp337 = icmp slt i32 %M, 1 + %0 = sext i32 %M to i64 + %N64 = zext i32 %N to i64 + %M64 = zext i32 %M to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ] + %1 = mul nsw i64 %indvars.iv42, %0 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1 + %2 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %cmp1 = icmp ne i32 %2, 0 ; Divergent condition + %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition + br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch. 
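+; (%brmerge above depends on %2, which is loaded inside outer.body from an +; address indexed by the outer IV, so the condition is not outer-loop invariant +; and neither successor is a loop header; canVectorizeOuterLoop rejects it.)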
+ +inner.ph: ; preds = %outer.body + br label %inner.body + +inner.body: ; preds = %inner.ph, %inner.body + %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ] + %3 = add nsw i64 %indvars.iv, %1 + %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3 + %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2 + %mul12 = mul nsw i32 %4, %4 + %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3 + store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %M64 + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 + %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64 + br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6 + +for.end19: ; preds = %outer.inc, %entry + ret void +} + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 8} +!8 = !{!"llvm.loop.vectorize.enable", i1 true}
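+ +; For reference, a minimal sketch (hypothetical names and labels) of an inner +; loop that isUniformLoop accepts: a canonical IV starting at 0 with step 1, a +; conditional latch branch, and a latch compare between the IV update and an +; outer-loop-invariant bound: +;   inner.body: +;     %j = phi i64 [ 0, %inner.ph ], [ %j.next, %inner.body ] +;     ... +;     %j.next = add nuw nsw i64 %j, 1 +;     %exitcond = icmp eq i64 %j.next, %M64 +;     br i1 %exitcond, label %outer.inc, label %inner.body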