Index: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -332,6 +332,11 @@
   /// If false, good old LV code.
   bool canVectorizeLoopNestCFG(Loop *Lp, bool UseVPlanNativePath);
 
+  /// Set up outer loop inductions by checking Phis in outer loop header for
+  /// supported inductions (int inductions). Return false if any of these Phis
+  /// is not a supported induction or if we fail to find an induction.
+  bool setupOuterLoopInductions();
+
   /// Return true if the pre-header, exiting and latch blocks of \p Lp
   /// (non-recursive) are considered legal for vectorization.
   /// Temporarily taking UseVPlanNativePath parameter. If true, take
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -516,6 +516,18 @@
       return false;
   }
 
+  // Check whether we are able to set up outer loop induction.
+  if (!setupOuterLoopInductions()) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Not vectorizing: Unsupported outer loop Phi(s).\n");
+    ORE->emit(createMissedAnalysis("UnsupportedPhi")
+              << "Unsupported outer loop Phi(s)");
+    if (DoExtraAnalysis)
+      Result = false;
+    else
+      return false;
+  }
+
   return Result;
 }
 
@@ -571,6 +583,32 @@
   LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
 }
 
+bool LoopVectorizationLegality::setupOuterLoopInductions() {
+  BasicBlock *Header = TheLoop->getHeader();
+
+  // Returns true if a given Phi is a supported induction.
+  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+    InductionDescriptor ID;
+    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
+        ID.getKind() == InductionDescriptor::IK_IntInduction) {
+      addInductionPhi(&Phi, ID, AllowedExit);
+      return true;
+    } else {
+      // Bail out for any Phi in the outer loop header that is not a supported
+      // induction.
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Found unsupported PHI for outer loop vectorization.\n");
+      return false;
+    }
+  };
+
+  if (llvm::all_of(Header->phis(), isSupportedPhi))
+    return true;
+  else
+    return false;
+}
+
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *Header = TheLoop->getHeader();
 
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,6 +58,7 @@
 #include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanHCFGTransforms.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -234,7 +235,7 @@
     cl::desc("The maximum interleave count to use when interleaving a scalar "
              "reduction in a nested loop."));
 
-static cl::opt EnableVPlanNativePath(
+cl::opt EnableVPlanNativePath(
     "enable-vplan-native-path", cl::init(false), cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
@@ -419,6 +420,9 @@
   /// the instruction.
   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
 
+  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
+  void fixNonInductionPHIs(void);
+
 protected:
   friend class LoopVectorizationPlanner;
 
@@ -686,6 +690,10 @@
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
   DenseMap IVEndValues;
+
+  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
+  // fixed up at the end of vector code generation.
+  SmallVector OrigPHIsToFix;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -888,6 +896,12 @@
   /// vectorization factor \p VF.
   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return false;
+
     auto Scalars = InstsToScalarize.find(VF);
     assert(Scalars != InstsToScalarize.end() &&
            "VF not yet analyzed for scalarization profitability");
@@ -898,6 +912,12 @@
   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
     if (VF == 1)
       return true;
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return false;
+
     auto UniformsPerVF = Uniforms.find(VF);
     assert(UniformsPerVF != Uniforms.end() &&
            "VF not yet analyzed for uniformity");
@@ -908,6 +928,12 @@
   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
     if (VF == 1)
       return true;
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return false;
+
     auto ScalarsPerVF = Scalars.find(VF);
     assert(ScalarsPerVF != Scalars.end() &&
            "Scalar values are not calculated for VF");
@@ -962,6 +988,12 @@
   /// through the cost modeling.
   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
     assert(VF >= 2 && "Expected VF >=2");
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return CM_GatherScatter;
+
     std::pair InstOnVF = std::make_pair(I, VF);
     auto Itr = WideningDecisions.find(InstOnVF);
     if (Itr == WideningDecisions.end())
@@ -1397,8 +1429,16 @@
     AU.addRequired();
     AU.addRequired();
     AU.addRequired();
-    AU.addPreserved();
-    AU.addPreserved();
+
+    // We currently do not preserve loopinfo/dominator analyses with outer loop
+    // vectorization. Until this is addressed, mark these analyses as preserved
+    // only for non-VPlan-native path.
+    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+    if (!EnableVPlanNativePath) {
+      AU.addPreserved();
+      AU.addPreserved();
+    }
+
     AU.addPreserved();
     AU.addPreserved();
   }
@@ -1749,8 +1789,9 @@
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
 
-  // If we have a stride that is replaced by one, do it here.
-  if (Legal->hasStride(V))
+  // If we have a stride that is replaced by one, do it here. Defer this for
+  // the VPlan-native path until we start running Legal checks in that path.
+  if (!EnableVPlanNativePath && Legal->hasStride(V))
     V = ConstantInt::get(V->getType(), 1);
 
   // If we have a vector mapped to this value, return it.
@@ -2416,6 +2457,10 @@
 }
 
 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+  // VPlan-native path does not do any analysis for runtime checks currently.
+  if (EnableVPlanNativePath)
+    return;
+
   BasicBlock *BB = L->getLoopPreheader();
 
   // Generate the code that checks in runtime if arrays overlap. We put the
@@ -3060,6 +3105,13 @@
   if (VF > 1)
     truncateToMinimalBitwidths();
 
+  // Fix widened non-induction PHIs by setting up the PHI operands.
+  if (OrigPHIsToFix.size()) {
+    assert(EnableVPlanNativePath &&
+           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
+    fixNonInductionPHIs();
+  }
+
   // At this point every instruction in the original loop is widened to a
   // vector form. Now we need to fix the recurrences in the loop. These PHI
   // nodes are currently empty because we did not want to introduce cycles.
@@ -3532,12 +3584,62 @@
   } while (Changed);
 }
 
+void InnerLoopVectorizer::fixNonInductionPHIs() {
+  for (PHINode *OrigPhi : OrigPHIsToFix) {
+    PHINode *NewPhi =
+        cast(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
+    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
+
+    SmallVector ScalarBBPredecessors(
+        predecessors(OrigPhi->getParent()));
+    SmallVector VectorBBPredecessors(
+        predecessors(NewPhi->getParent()));
+    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
+           "Scalar and Vector BB should have the same number of predecessors");
+
+    // The insertion point in Builder may be invalidated by the time we get
+    // here. Force the Builder insertion point to something valid so that we do
+    // not run into issues during insertion point restore in
+    // getOrCreateVectorValue calls below.
+    Builder.SetInsertPoint(NewPhi);
+
+    // The predecessor order is preserved and we can rely on mapping between
+    // scalar and vector block predecessors.
+    for (unsigned i = 0; i < NumIncomingValues; ++i) {
+      BasicBlock *NewPredBB = VectorBBPredecessors[i];
+
+      // When looking up the new scalar/vector values to fix up, use incoming
+      // values from original phi.
+      Value *ScIncV =
+          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
+
+      // Scalar incoming value may need a broadcast
+      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+      NewPhi->addIncoming(NewIncV, NewPredBB);
+    }
+  }
+}
+
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
+  PHINode *P = cast(PN);
+  if (EnableVPlanNativePath) {
+    // Currently we enter here in the VPlan-native path for non-induction
+    // PHIs where all control flow is uniform. We simply widen these PHIs.
+    // Create a vector phi with no operands - the vector phi operands will be
+    // set at the end of vector code generation.
+    Type *VecTy =
+        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
+    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+    OrigPHIsToFix.push_back(P);
+
+    return;
+  }
+
   assert(PN->getParent() == OrigLoop->getHeader() &&
          "Non-header phis should have been handled elsewhere");
 
-  PHINode *P = cast(PN);
   // In order to support recurrences we need to be able to vectorize Phi nodes.
   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -3893,6 +3995,10 @@
   // Forget the original basic block.
   PSE.getSE()->forgetLoop(OrigLoop);
 
+  // DT is not kept up-to-date for outer loop vectorization
+  if (EnableVPlanNativePath)
+    return;
+
   // Update the dominator tree information.
   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
          "Entry does not dominate exit.");
 
@@ -6527,6 +6633,13 @@
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
   HCFGBuilder.buildHierarchicalCFG();
 
+  SmallPtrSet DeadInstructions;
+  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+      Plan, Legal->getInductionVars(), DeadInstructions);
+
+  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+    Plan->addVF(VF);
+
   return Plan;
 }
 
@@ -6728,11 +6841,26 @@
       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 
   // Plan how to best vectorize, return the best VF and its cost.
-  LVP.planInVPlanNativePath(OptForSize, UserVF);
+  VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
 
-  // Returning false. We are currently not generating vector code in the VPlan
-  // native path.
-  return false;
+  // If we are stress testing VPlan builds, do not attempt to generate vector
+  // code.
+  if (VPlanBuildStressTest)
+    return false;
+
+  LVP.setBestPlan(VF.Width, 1);
+
+  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
+                         &CM);
+  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+                    << L->getHeader()->getParent()->getName() << "\"\n");
+  LVP.executePlan(LB, DT);
+
+  // Mark the loop as already vectorized to avoid vectorizing again.
+  Hints.setAlreadyVectorized();
+
+  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+  return true;
 }
 
 bool LoopVectorizePass::processLoop(Loop *L) {
@@ -7123,8 +7251,15 @@
   if (!Changed)
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
-  PA.preserve();
-  PA.preserve();
+
+  // We currently do not preserve loopinfo/dominator analyses with outer loop
+  // vectorization. Until this is addressed, mark these analyses as preserved
+  // only for non-VPlan-native path.
+  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+  if (!EnableVPlanNativePath) {
+    PA.preserve();
+    PA.preserve();
+  }
   PA.preserve();
   PA.preserve();
   return PA;
Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.h
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.h
@@ -293,6 +293,10 @@
     /// of replication, maps the BasicBlock of the last replica created.
     SmallDenseMap VPBB2IRBB;
 
+    /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
+    /// up at the end of vector code generation.
+    SmallVector VPBBsToFix;
+
     CFGState() = default;
   } CFG;
Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
@@ -44,6 +44,7 @@
 #include 
 using namespace llvm;
 
+extern cl::opt EnableVPlanNativePath;
 #define DEBUG_TYPE "vplan"
 
@@ -124,6 +125,20 @@
     VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
     auto &PredVPSuccessors = PredVPBB->getSuccessors();
     BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+
+    // In outer loop vectorization scenario, the predecessor BBlock may not yet
+    // be visited(backedge). Mark the VPBasicBlock for fixup at the end of
+    // vectorization. We do not encounter this case in inner loop vectorization
+    // as we start out by building a loop skeleton with the vector loop header
+    // and latch blocks. As a result, we never enter this function for the
+    // header block in the non VPlan-native path.
+    if (!PredBB) {
+      assert(EnableVPlanNativePath &&
+             "Unexpected null predecessor in non VPlan-native path");
+      CFG.VPBBsToFix.push_back(PredVPBB);
+      continue;
+    }
+
     assert(PredBB && "Predecessor basic-block not found building successor.");
     auto *PredBBTerminator = PredBB->getTerminator();
     LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
@@ -185,6 +200,35 @@
   for (VPRecipeBase &Recipe : Recipes)
     Recipe.execute(*State);
 
+  VPValue *CBV;
+  if (EnableVPlanNativePath && (CBV = getCondBit())) {
+    Value *IRCBV = CBV->getUnderlyingValue();
+    assert(IRCBV && "Unexpected null underlying value for condition bit");
+
+    // Delete the condition bit at this point - it should be no longer needed.
+    delete CBV;
+    setCondBit(nullptr);
+
+    // Condition bit value in a VPBasicBlock is used as the branch selector. In
+    // the VPlan-native path case, since all branches are uniform we generate a
+    // branch instruction using the condition value from vector lane 0 and dummy
+    // successors. The successors are fixed later when the successor blocks are
+    // visited.
+    Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
+    NewCond = State->Builder.CreateExtractElement(NewCond,
+                                                  State->Builder.getInt32(0));
+
+    // Replace the temporary unreachable terminator with the new conditional
+    // branch.
+    auto *CurrentTerminator = NewBB->getTerminator();
+    assert(isa(CurrentTerminator) &&
+           "Expected to replace unreachable terminator with conditional "
+           "branch.");
+    auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
+    CondBr->setSuccessor(0, nullptr);
+    ReplaceInstWithInst(CurrentTerminator, CondBr);
+  }
+
   LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
 }
 
@@ -194,6 +238,20 @@
   if (!isReplicator()) {
     // Visit the VPBlocks connected to "this", starting from it.
     for (VPBlockBase *Block : RPOT) {
+      if (EnableVPlanNativePath) {
+        // The inner loop vectorization path does not represent loop preheader
+        // and exit blocks as part of the VPlan. In the VPlan-native path, skip
+        // vectorizing loop preheader block. In future, we may replace this
+        // check with the check for loop preheader.
+        if (Block->getNumPredecessors() == 0)
+          continue;
+
+        // Skip vectorizing loop exit block. In future, we may replace this
+        // check with the check for loop exit.
+        if (Block->getNumSuccessors() == 0)
+          continue;
+      }
+
       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
       Block->execute(State);
     }
@@ -319,11 +377,32 @@
   for (VPBlockBase *Block : depth_first(Entry))
     Block->execute(State);
 
+  // Setup branch terminator successors for VPBBs in VPBBsToFix based on
+  // VPBB's successors.
+  for (auto VPBB : State->CFG.VPBBsToFix) {
+    assert(EnableVPlanNativePath &&
+           "Unexpected VPBBsToFix in non VPlan-native path");
+    BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
+    assert(BB && "Unexpected null basic block for VPBB");
+
+    unsigned Idx = 0;
+    auto *BBTerminator = BB->getTerminator();
+
+    for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
+      VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
+      BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
+      ++Idx;
+    }
+  }
+
   // 3. Merge the temporary latch created with the last basic-block filled.
   BasicBlock *LastBB = State->CFG.PrevBB;
   // Connect LastBB to VectorLatchBB to facilitate their merge.
-  assert(isa(LastBB->getTerminator()) &&
-         "Expected VPlan CFG to terminate with unreachable");
+  assert((EnableVPlanNativePath ||
+          isa(LastBB->getTerminator())) &&
+         "Expected InnerLoop VPlan CFG to terminate with unreachable");
+  assert((!EnableVPlanNativePath || isa(LastBB->getTerminator())) &&
+         "Expected VPlan CFG to terminate with branch in NativePath");
   LastBB->getTerminator()->eraseFromParent();
   BranchInst::Create(VectorLatchBB, LastBB);
 
@@ -333,7 +412,9 @@
   assert(Merged && "Could not merge last basic block with latch.");
   VectorLatchBB = LastBB;
 
-  updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
+  // We do not attempt to preserve DT for outer loop vectorization currently.
+  if (!EnableVPlanNativePath)
+    updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
 }
 
 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
Index: llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -24,6 +24,17 @@
   VPRegionBlock *TopRegion = dyn_cast(Plan->getEntry());
   ReversePostOrderTraversal RPOT(TopRegion->getEntry());
+
+  // Condition bit VPValues get deleted during transformation to VPRecipes.
+  // Create new VPValues and save away as condition bits. These will be deleted
+  // after finalizing the vector IR basic blocks.
+  for (VPBlockBase *Base : RPOT) {
+    VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+    if (auto *CondBit = VPBB->getCondBit()) {
+      auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
+      VPBB->setCondBit(NCondBit);
+    }
+  }
+
   for (VPBlockBase *Base : RPOT) {
     // Do not widen instructions in pre-header and exit blocks.
     if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
Index: llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h
+++ llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h
@@ -38,6 +38,9 @@
 // and live-outs which the VPlan will need to fix accordingly.
 class VPValue {
   friend class VPBuilder;
+  friend class VPlanHCFGTransforms;
+  friend class VPBasicBlock;
+
 private:
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
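The two new tests below exercise the VPlan-native path end to end. The path stays off by default; going by the RUN lines in the tests, it can be reproduced manually with an invocation of the following form (the input file name here is just the first test's own file name):

  opt -S -loop-vectorize -enable-vplan-native-path outer_loop_test1.ll

Both tests request an explicit vectorization factor through llvm.loop metadata. That matters because, per the cost-model changes above, the cost model is not run in the VPlan-native path yet, so the user-specified VF is the one handed to planInVPlanNativePath and setBestPlan.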
Index: llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -0,0 +1,82 @@
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i1 = 0; i1 < 8; i1++) {
+;     arr2[i1] = i1;
+;     for (i2 = 0; i2 < 8; i2++)
+;       arr[i2][i1] = i1 + n;
+;   }
+; }
+;
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> )
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]],
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]],
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]],
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                  ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                 ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                  ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                 ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll
@@ -0,0 +1,112 @@
+; int A[1024], B[1024];
+;
+; void foo(int iCount, int c, int jCount)
+; {
+;
+;   int i, j;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i = 0; i < iCount; i++) {
+;     A[i] = c;
+;     for (j = 0; j < jCount; j++) {
+;       A[i] += B[j] + i;
+;     }
+;   }
+; }
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK: %[[ZeroTripChk:.*]] = icmp sgt i32 %jCount, 0
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[CVal0:.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+; CHECK-NEXT: %[[CSplat:.*]] = shufflevector <4 x i32> %[[CVal0]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[ZVal0:.*]] = insertelement <4 x i1> undef, i1 %[[ZeroTripChk]], i32 0
+; CHECK-NEXT: %[[ZSplat:.*]] = shufflevector <4 x i1> %[[ZVal0]], <4 x i1> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[CSplat]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> )
+; CHECK: %[[ZCmpExtr:.*]] = extractelement <4 x i1> %[[ZSplat]], i32 0
+; CHECK: br i1 %[[ZCmpExtr]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]]
+
+; CHECK: [[InnerForPh]]:
+; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[AAddr]], i32 4, <4 x i1> , <4 x i32> undef)
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: br label %[[InnerForBody:.*]]
+
+; CHECK: [[InnerForBody]]:
+; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
+; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
+; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
+; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> , <4 x i32> undef)
+; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
+; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]]
+; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]],
+; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}}
+; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]]
+
+; CHECK: [[InnerCrit]]:
+; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StorePhi]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> )
+; CHECK: br label %[[ForInc]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]],
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], {{.*}}
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@A = common global [1024 x i32] zeroinitializer, align 16
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %iCount, i32 %c, i32 %jCount) {
+entry:
+  %cmp22 = icmp sgt i32 %iCount, 0
+  br i1 %cmp22, label %for.body.lr.ph, label %for.end11
+
+for.body.lr.ph:                            ; preds = %entry
+  %cmp220 = icmp sgt i32 %jCount, 0
+  %wide.trip.count = zext i32 %jCount to i64
+  %wide.trip.count27 = zext i32 %iCount to i64
+  br label %for.body
+
+for.body:                                  ; preds = %for.inc9, %for.body.lr.ph
+  %indvars.iv25 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next26, %for.inc9 ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv25
+  store i32 %c, i32* %arrayidx, align 4
+  br i1 %cmp220, label %for.body3.lr.ph, label %for.inc9
+
+for.body3.lr.ph:                           ; preds = %for.body
+  %arrayidx.promoted = load i32, i32* %arrayidx, align 4
+  %0 = trunc i64 %indvars.iv25 to i32
+  br label %for.body3
+
+for.body3:                                 ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %1 = phi i32 [ %arrayidx.promoted, %for.body3.lr.ph ], [ %add8, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %0
+  %add8 = add nsw i32 %add, %1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond1.for.inc9_crit_edge, label %for.body3
+
+for.cond1.for.inc9_crit_edge:              ; preds = %for.body3
+  store i32 %add8, i32* %arrayidx, align 4
+  br label %for.inc9
+
+for.inc9:                                  ; preds = %for.cond1.for.inc9_crit_edge, %for.body
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond28 = icmp eq i64 %indvars.iv.next26, %wide.trip.count27
+  br i1 %exitcond28, label %for.end11, label %for.body, !llvm.loop !1
+
+for.end11:                                 ; preds = %for.inc9, %entry
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
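Of the two tests, only outer_loop_test2.ll carries a non-induction PHI inside the loop nest: the running sum in the inner loop, i.e.

  %1 = phi i32 [ %arrayidx.promoted, %for.body3.lr.ph ], [ %add8, %for.body3 ]

which the VPlan-native path widens into the %[[AccumPhi]] vector PHI checked above and then patches up through the new OrigPHIsToFix / fixNonInductionPHIs mechanism, since its operands are only known once the whole vector body has been emitted.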