diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2195,14 +2195,6 @@ return false; } - if (Hints.getInterleave() > 1) { - // TODO: Interleave support is future work. - LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " - "outer loops.\n"); - Hints.emitRemarkWithHints(); - return false; - } - return true; } @@ -4019,16 +4011,30 @@ auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &P : VPBB->phis()) { - VPWidenPHIRecipe *VPPhi = dyn_cast(&P); - if (!VPPhi) - continue; - PHINode *NewPhi = cast(State.get(VPPhi, 0)); - // Make sure the builder has a valid insert point. - Builder.SetInsertPoint(NewPhi); - for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { - VPValue *Inc = VPPhi->getIncomingValue(i); - VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); - NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); + if (auto *VPPhi = dyn_cast(&P)) { + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + PHINode *NewPhi = cast(State.get(VPPhi, Part)); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, Part), + State.CFG.VPBB2IRBB[VPBB]); + } + } + } + + if (auto *VPPhi = dyn_cast(&P)) { + PHINode *NewPhi = cast(State.get(VPPhi, VPIteration(0, 0))); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, VPIteration(0, 0)), + State.CFG.VPBB2IRBB[VPBB]); + } } } } @@ -7342,16 +7348,23 @@ // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment // doesn't have a cost model that can choose which plan to execute if // more than one is generated. -static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, - LoopVectorizationCostModel &CM) { +static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, + LoopVectorizationCostModel &CM) { unsigned WidestType; std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); - return WidestVectorRegBits / WidestType; + + TargetTransformInfo::RegisterKind RegKind = + TTI.enableScalableVectorization() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; + + TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); + unsigned N = RegSize.getKnownMinValue() / WidestType; + return ElementCount::get(N, RegSize.isScalable()); } VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -7361,10 +7374,7 @@ // If the user doesn't provide a vectorization factor, determine a // reasonable one. 
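A note on the planner changes in this hunk and the next: determineVPlanVF now returns an ElementCount so that scalable targets get a `vscale x N` factor, and a user-requested scalable VF is rejected when the target cannot honour it. A minimal sketch of both rules, using only the standard ElementCount/TypeSize APIs; the helper names are illustrative and not part of the patch, and the planner code continues right after this note.

#include "llvm/Support/TypeSize.h"

// Divide the known-minimum register width by the widest scalar type width and
// carry over the scalable flag of the chosen register kind.
static llvm::ElementCount computeVF(llvm::TypeSize RegSize,
                                    unsigned WidestTypeBits) {
  unsigned MinLanes = RegSize.getKnownMinValue() / WidestTypeBits;
  return llvm::ElementCount::get(MinLanes, RegSize.isScalable());
}

// A scalable user VF is only usable when the target supports scalable vectors
// (or the force-support flag used for testing is set); otherwise the planner
// emits a remark and returns VectorizationFactor::Disabled().
static bool canUseUserVF(llvm::ElementCount UserVF, bool TargetHasScalable,
                         bool ForceScalable) {
  return !UserVF.isScalable() || TargetHasScalable || ForceScalable;
}

For example, a scalable 128-bit register with i32 as the widest type gives `vscale x 4`, while a fixed 256-bit register gives a fixed VF of 8.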
if (UserVF.isZero()) { - VF = ElementCount::getFixed(determineVPlanVF( - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(), - CM)); + VF = determineVPlanVF(TTI, CM); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7373,6 +7383,19 @@ << "overriding computed VF.\n"); VF = ElementCount::getFixed(4); } + } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors) { + + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + OrigLoop->getStartLoc(), + OrigLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " cannot be used for outer-loop vectorization because the" + << " target does not support scalable vectors."; + }); + return VectorizationFactor::Disabled(); } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); assert(isPowerOf2_32(VF.getKnownMinValue()) && @@ -8977,6 +9000,10 @@ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), CM.getTailFoldingStyle()); + VPlanTransforms::findAndReplaceUniformRecipes(*Plan, OrigLoop, *PSE.getSE(), + *LI); + VPlanTransforms::optimize(*Plan, *PSE.getSE()); + LLVM_DEBUG(Plan->dump()); return Plan; } @@ -9603,6 +9630,12 @@ CM.collectElementTypesForWidening(); + // The VPlan-native path does not have a cost model, so the only way to get + // a unroll factor is to query the loop vectorization hints. + unsigned UF = Hints.getInterleave(); + if (!UF) + UF = 1; + // Plan how to best vectorize, return the best VF and its cost. const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -9618,10 +9651,10 @@ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + VF.Width, UF, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.executePlan(VF.Width, UF, BestPlan, LB, DT, false); } reportVectorization(ORE, L, VF, 1); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1116,6 +1116,7 @@ case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: return true; }; @@ -1514,6 +1515,50 @@ VPValue *getIncomingValue(unsigned I) { return getOperand(I); } }; +// A recipe for handling header phis that stay scalar in the vector loop. +// Only to be used in the VPlan native path, and only in inner loops, +// never the top-level loop of a VPlan. +class VPScalarPHIRecipe : public VPHeaderPHIRecipe { + /// List of incoming blocks. + SmallVector IncomingBlocks; + +public: + /// Create a new VPScalarPHIRecipe for \p Phi with start value \p Start. + VPScalarPHIRecipe(PHINode *Phi) + : VPHeaderPHIRecipe(VPDef::VPScalarPHISC, Phi) {} + + ~VPScalarPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPScalarPHISC) + + /// Generate the phi nodes. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. + void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { + addOperand(IncomingV); + IncomingBlocks.push_back(IncomingBlock); + } + + /// Returns the \p I th incoming VPBasicBlock. + VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } + + /// Returns the \p I th incoming VPValue. + VPValue *getIncomingValue(unsigned I) { return getOperand(I); } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// A recipe for handling first-order recurrence phis. The start value is the /// first operand of the recipe and the incoming value from the backedge is the /// second operand. @@ -1975,6 +2020,9 @@ // Return whether the loaded-from / stored-to addresses are consecutive. bool isConsecutive() const { return Consecutive; } + // Mark the memory access of this recipe as beeing consecutive. + void makeConsecutive() { Consecutive = true; } + // Return whether the consecutive loaded/stored addresses are in reverse // order. bool isReverse() const { return Reverse; } @@ -3004,6 +3052,8 @@ return Rep->isUniform(); if (auto *GEP = dyn_cast(Def)) return all_of(GEP->operands(), isUniformAfterVectorization); + if (isa(VPV)) + return true; return false; } } // end namespace vputils diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -795,7 +795,7 @@ VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { // Skip phi-like recipes that generate their backedege values themselves. - if (isa(&R)) + if (isa(&R) || isa(&R)) continue; if (isa(&R) || diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -119,7 +119,7 @@ // Get or create a region for the loop containing BB. 
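The VPlanHCFGBuilder guard just below only creates a VPRegionBlock for loops whose depth is at least that of the loop being vectorized; blocks belonging to loops that enclose it stay in the parent region. A small sketch of the predicate (illustrative helper, not patch code), before the builder code continues:

#include "llvm/Analysis/LoopInfo.h"

// Loops shallower than TheLoop (i.e. loops enclosing the vectorized loop)
// do not get a region of their own.
static bool needsOwnRegion(const llvm::Loop *LoopOfBB,
                           const llvm::Loop *TheLoop) {
  return LoopOfBB && LoopOfBB->getLoopDepth() >= TheLoop->getLoopDepth();
}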
Loop *CurrentLoop = LI->getLoopFor(BB); VPRegionBlock *ParentR = nullptr; - if (CurrentLoop) { + if (CurrentLoop && CurrentLoop->getLoopDepth() >= TheLoop->getLoopDepth()) { auto Iter = Loop2Region.insert({CurrentLoop, nullptr}); if (Iter.second) Iter.first->second = new VPRegionBlock( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -61,6 +61,7 @@ case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: + case VPScalarPHISC: case VPWidenSC: case VPWidenSelectSC: { const Instruction *I = @@ -95,6 +96,7 @@ case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: + case VPScalarPHISC: case VPWidenSC: case VPWidenSelectSC: { const Instruction *I = @@ -136,6 +138,7 @@ case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: + case VPScalarPHISC: case VPWidenPointerInductionSC: case VPWidenSC: case VPWidenSelectSC: { @@ -1643,10 +1646,12 @@ StartIdx = I; } } - Value *Op0 = State.get(getOperand(StartIdx), 0); - Type *VecTy = Op0->getType(); - Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); - State.set(this, VecPhi, 0); + + Type *VecTy = State.get(getOperand(StartIdx), 0)->getType(); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); + State.set(this, VecPhi, Part); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1670,6 +1675,46 @@ } #endif +void VPScalarPHIRecipe::execute(VPTransformState &State) { + assert(EnableVPlanNativePath && + "Non-native vplans are not expected to have VPScalarPHIRecipes."); + + // This recipe is used in outer-loop vectorization for the PHIs in + // the headers of inner loops. Only unifom loop nests are supported, + // control flow is always uniform. + + // Create a phi with no operands - the phi operands will be + // set at the end of vector code generation. + VPBasicBlock *Parent = getParent(); + VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); + unsigned StartIdx = 0; + // For phis in header blocks of loop regions, use the index of the value + // coming from the preheader to get the type. + if (LoopRegion->getEntryBasicBlock() == Parent) { + for (unsigned I = 0; I < getNumOperands(); ++I) { + if (getIncomingBlock(I) == + LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) + StartIdx = I; + } + } + + Type *Ty = State.get(getOperand(StartIdx), VPIteration(0, 0))->getType(); + Value *NewPhi = State.Builder.CreatePHI(Ty, 2, "scalar.phi"); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, NewPhi, VPIteration(Part, 0)); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "SCALAR-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + // TODO: It would be good to use the existing VPWidenPHIRecipe instead and // remove VPActiveLaneMaskPHIRecipe. 
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,6 +23,7 @@ class PHINode; class ScalarEvolution; class Loop; +class LoopInfo; class PredicatedScalarEvolution; class TargetLibraryInfo; class VPBuilder; @@ -37,6 +38,14 @@ GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI); + /// Replace widening recipes where all users only use the first lane by + /// uniform VPReplicateRecipes. Also, check if memory accesses can be + /// marked as uniform or consecutive. This transformation is only usefull to + /// the VPlan-native path. + static void findAndReplaceUniformRecipes(VPlan &Plan, const Loop *TheLoop, + ScalarEvolution &SE, + const LoopInfo &LI); + /// Sink users of fixed-order recurrences after the recipe defining their /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions /// to combine the value from the recurrence phis and previous values. The diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -13,15 +13,19 @@ #include "VPlanTransforms.h" #include "VPRecipeBuilder.h" +#include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#define DEBUG_TYPE "loop-vectorize" + using namespace llvm; using namespace llvm::PatternMatch; @@ -96,6 +100,297 @@ } } +// Knowing that all recipes in ScalarUse have at least one user that only uses +// the first lane, find recipes that are widening but that can be replaced to +// only calculate the scalar value of the first lane. +static void +collectScalarizeableRecipes(SetVector &ScalarUse, + SetVector &Scalarizeable) { + // Return true if the value V is only used by recipes that only require + // the first lane or by VPWidenPHINodes, and false otherwise. + auto CheckUses = + [&](VPValue *V, + SmallSetVector &NonScalarPHIUses) -> bool { + for (VPUser *U : V->users()) { + if (U->onlyFirstLaneUsed(V)) + continue; + + if (auto *R = dyn_cast(U); R && Scalarizeable.contains(R)) + continue; + + if (auto *Phi = dyn_cast(U)) { + NonScalarPHIUses.insert(Phi); + continue; + } + + return false; + } + return true; + }; + + // If the Phi has a single non-scalar user that is a VPWidenPHIRecipe, + // return that Phi. Otherwise, return nullptr. + auto GetOnlyNonScalarUseOfPhi = + [&](VPWidenPHIRecipe *Phi) -> VPWidenRecipe * { + VPWidenRecipe *SingleNonScalarUse = nullptr; + for (VPUser *U : Phi->users()) { + if (U->onlyFirstLaneUsed(Phi) || + (isa(U) && + Scalarizeable.contains(cast(U)))) + continue; + + if (SingleNonScalarUse || !isa(U)) + return nullptr; + + SingleNonScalarUse = cast(U); + } + return SingleNonScalarUse; + }; + + SmallSetVector NonScalarPHIUses; + + // Start the worklist with all recipes that have at least one scalar use. 
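Before the implementation continues below, here is a self-contained sketch of the fixed-point worklist idea behind collectScalarizeableRecipes: seed with recipes that have a scalar use, mark a recipe scalarizable once every user either needs only lane 0 or is itself already scalarizable, then revisit its operands. The VPWidenPHIRecipe cycle-breaking in the real code is omitted, and the types are simplified stand-ins rather than VPlan classes.

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Users;
  std::vector<Node *> Operands;
  bool UsesOnlyLane0 = false; // this node only reads lane 0 of its operands
};

static std::set<Node *> collectScalarizable(const std::vector<Node *> &Seeds) {
  std::set<Node *> Scalarizable;
  std::vector<Node *> Worklist(Seeds.begin(), Seeds.end());
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (Scalarizable.count(N))
      continue;
    // N may stay scalar only if every user needs just lane 0 or is itself
    // known to be scalarizable.
    bool AllUsersScalar = true;
    for (Node *U : N->Users)
      if (!U->UsesOnlyLane0 && !Scalarizable.count(U))
        AllUsersScalar = false;
    if (!AllUsersScalar)
      continue;
    Scalarizable.insert(N);
    // Operands of a newly scalar node may now also become scalar; revisit them.
    for (Node *Op : N->Operands)
      Worklist.push_back(Op);
  }
  return Scalarizable;
}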
+ SetVector Worklist(ScalarUse); + while (!Worklist.empty()) { + VPRecipeBase *R = Worklist.pop_back_val(); + VPValue *V = R->getVPSingleValue(); + if (!V || Scalarizeable.contains(R) || + !(isa(R) || isa(R) || + isa(R) || isa(R) || + isa(R))) + continue; + + LLVM_DEBUG(dbgs() << "LV: Poped worklist item: "; V->dump()); + + // Phi-nodes can create def-use chain cycles, so we look exactly one + // instruction ahead to know if its only non-scalar use could be scalarized + // if the PHI itself is scalar. This allowes scalarization of inner-loop + // induction variables. + if (auto *Phi = dyn_cast(R)) { + VPWidenRecipe *User = GetOnlyNonScalarUseOfPhi(Phi); + if (User && is_contained(User->users(), Phi) && + all_of(User->users(), [&](VPUser *U) -> bool { + return U == Phi || U->onlyFirstLaneUsed(User) || + (isa(U) && + Scalarizeable.contains(cast(U))); + })) { + + // The PHI can be scalarized! + LLVM_DEBUG(dbgs() << "LV: Scalarize: "; V->dump()); + Scalarizeable.insert(Phi); + for (VPValue *Op : R->operands()) + if (auto *OpR = Op->getDefiningRecipe()) + Worklist.insert(OpR); + } + } + + NonScalarPHIUses.clear(); + if (!CheckUses(V, NonScalarPHIUses)) + continue; + + // If absolutely all uses are scalar, add the recipe to the set of + // scalarizeable recipes and add everything it uses itself to the + // worklist (if that is a recipe that is not already in the set). + if (NonScalarPHIUses.empty()) { + LLVM_DEBUG(dbgs() << "LV: Scalarize: "; V->dump()); + Scalarizeable.insert(R); + for (VPValue *Op : R->operands()) + if (auto *OpR = Op->getDefiningRecipe()) + Worklist.insert(OpR); + continue; + } + + // Make sure all PHI + for (VPWidenPHIRecipe *UsingPhi : NonScalarPHIUses) { + // Add all users of the PHI to the worklist, except the current recipe. + for (VPUser *U : UsingPhi->users()) + if (auto *UR = dyn_cast(U); UR && UR != R) + Worklist.insert(UR); + + // Now add the PHI itself to the worklist. + Worklist.insert(UsingPhi); + } + } +} + +enum class MemAccessKind { Unknown, Uniform, Consecutive }; + +// Helper function for the VPlan-native path that returns what kind +// of memory access the pointer represents: Unknown, Uniform or Consecutive. +static MemAccessKind +checkMemoryAccessesForVPlanNativePath(VPValue *Ptr, Type *AccessTy, + ScalarEvolution &SE, const LoopInfo &LI, + const Loop *TheLoop) { + Value *V = Ptr->getUnderlyingValue(); + if (!V || !V->getType()->isPointerTy()) + return MemAccessKind::Unknown; + + const SCEV *PtrScev = SE.getSCEV(V); + if (isa(PtrScev)) + return MemAccessKind::Unknown; + + // Peel of recurrences around inner loops of TheLoop. + const SCEV *S = PtrScev; + while (true) { + if (auto *AR = dyn_cast(S)) { + // Stop when a recurrence around TheLoop was found, or when we hit a outer + // loop of TheLoop. + if (AR->getLoop() == TheLoop || AR->getLoop()->contains(TheLoop)) + break; + + // The step of a inner loop can be whatever it wants, as long as it + // does not depend on the current iteration of TheLoop. + const SCEV *Step = AR->getStepRecurrence(SE); + if (!SE.isLoopInvariant(Step, TheLoop)) + return MemAccessKind::Unknown; + + S = AR->getStart(); + continue; + } + + // Add's can be ignored if the value that is added is loop invariant. 
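The address classification below can be summarised compactly. This is a condensed sketch of the same logic as the surrounding function, minus the SCEVAddExpr peeling and the wrapping/inbounds checks, using only standard ScalarEvolution APIs; the full implementation follows.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"

enum class Kind { Unknown, Uniform, Consecutive };

static Kind classifyAddress(const llvm::SCEV *S, const llvm::Loop *TheLoop,
                            llvm::ScalarEvolution &SE, uint64_t EltSize) {
  using namespace llvm;
  // Peel recurrences that belong to loops nested inside TheLoop; their step
  // must not depend on the current iteration of TheLoop.
  while (auto *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    if (AR->getLoop() == TheLoop || AR->getLoop()->contains(TheLoop))
      break;
    if (!SE.isLoopInvariant(AR->getStepRecurrence(SE), TheLoop))
      return Kind::Unknown;
    S = AR->getStart();
  }
  // A recurrence around TheLoop with a constant step equal to the element
  // size is a consecutive access (inbounds/wrapping checks omitted here).
  if (auto *AR = dyn_cast<SCEVAddRecExpr>(S); AR && AR->getLoop() == TheLoop) {
    auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE));
    return Step && Step->getAPInt() == EltSize ? Kind::Consecutive
                                               : Kind::Unknown;
  }
  // Anything invariant in TheLoop is uniform across all lanes.
  return SE.isLoopInvariant(S, TheLoop) ? Kind::Uniform : Kind::Unknown;
}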
+ if (auto *Add = dyn_cast(S)) { + for (unsigned I = 1, N = Add->getNumOperands(); I < N; ++I) + if (!SE.isLoopInvariant(Add->getOperand(I), TheLoop)) + return MemAccessKind::Unknown; + + S = Add->getOperand(0); + continue; + } + + break; + } + + // If the unpeeled SCEV for the pointer is a recurrence around TheLoop, + // this memory access could be consecutive. + auto *AR = dyn_cast(S); + if (AR && AR->getLoop() == TheLoop) { + const auto *Step = dyn_cast(AR->getStepRecurrence(SE)); + if (!Step) + return MemAccessKind::Unknown; + + // Check if the step if equal to the size of the accessed elements. + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + TypeSize AllocSize = DL.getTypeAllocSize(AccessTy); + int64_t Size = AllocSize.getFixedValue(); + if (Step->getAPInt() != Size) + return MemAccessKind::Unknown; + + // The address calculation is not allowed to wrap. + if (auto *GEP = dyn_cast(V); GEP && GEP->isInBounds()) + return MemAccessKind::Consecutive; + + // Even if the address calculation is not explicitly marked as not wrapping, + // we can assume that it does not if the null pointer is undefined. + if (!NullPointerIsDefined(TheLoop->getHeader()->getParent(), + V->getType()->getPointerAddressSpace())) + return MemAccessKind::Consecutive; + + return MemAccessKind::Unknown; + } + + // If the unpeeled SCEV for the pointer is invariant to the vectorized loop, + // the access will be uniform accross all lanes. + return SE.isLoopInvariant(S, TheLoop) ? MemAccessKind::Uniform + : MemAccessKind::Unknown; +} + +void VPlanTransforms::findAndReplaceUniformRecipes(VPlan &Plan, + const Loop *TheLoop, + ScalarEvolution &SE, + const LoopInfo &LI) { + ReversePostOrderTraversal> RPOT( + Plan.getEntry()); + + // Helper function to replace a recipe by another one. + auto ReplaceRecipe = [](VPRecipeBase *OldRep, VPRecipeBase *NewRep) { + assert(NewRep->getNumDefinedValues() <= 1 && + OldRep->getNumDefinedValues() <= 1 && "unexpected number of values"); + NewRep->insertBefore(OldRep); + if (OldRep->getNumDefinedValues() == 1) + OldRep->getVPSingleValue()->replaceAllUsesWith( + NewRep->getVPSingleValue()); + OldRep->eraseFromParent(); + }; + + SetVector HasScalarUse; + + // Recipes are visited in reverse order because that minimizes the amount + // of work in collectScalarizeableRecipes() in the common cases. + for (VPBasicBlock *VPBB : + reverse(VPBlockUtils::blocksOnly(RPOT))) { + // The branch-on-cond terminator recipe only uses the first lane value. + if (auto *Br = dyn_cast_or_null(VPBB->getTerminator())) { + if (Br->getOpcode() == VPInstruction::BranchOnCond) + if (auto *R = Br->getOperand(0)->getDefiningRecipe()) + HasScalarUse.insert(R); + } + + for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { + if (auto *MemRecipe = dyn_cast(&R)) { + VPValue *Ptr = MemRecipe->getAddr(); + Type *ETy = getLoadStoreType(&MemRecipe->getIngredient()); + MemAccessKind MemAccess = + checkMemoryAccessesForVPlanNativePath(Ptr, ETy, SE, LI, TheLoop); + + // Replace uniform loads by a replicating load, and check if the + // recipes used for the address calculation can be scalarized. 
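For reference, a uniform VPReplicateRecipe ultimately emits one scalar load per part, with a broadcast only where a vector user still needs the value; this is the load/insertelement/shufflevector sequence the updated tests below check for. A hedged sketch of that lowering with plain IRBuilder calls, independent of the VPlan plumbing, before the replacement code continues:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/TypeSize.h"

// One lane-0 load, then a splat for any remaining vector users.
static llvm::Value *emitUniformLoadThenSplat(llvm::IRBuilder<> &B,
                                             llvm::Type *EltTy,
                                             llvm::Value *ScalarPtr,
                                             llvm::ElementCount VF) {
  llvm::Value *Scalar = B.CreateLoad(EltTy, ScalarPtr);
  return B.CreateVectorSplat(VF, Scalar);
}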
+ if (MemAccess == MemAccessKind::Uniform && !MemRecipe->isStore()) { + LLVM_DEBUG(dbgs() << "LV: Uniform memory access: "; + MemRecipe->dump()); + assert(MemRecipe->getMask() == nullptr); + auto *UniformLoad = new VPReplicateRecipe( + &MemRecipe->getIngredient(), MemRecipe->operands(), true); + ReplaceRecipe(MemRecipe, UniformLoad); + if (auto *R = Ptr->getDefiningRecipe()) + HasScalarUse.insert(R); + continue; + } + + // Mark consecutive loads or stores as such, and check if the address + // calculation recipes can be scalarized. + if (MemAccess == MemAccessKind::Consecutive) { + LLVM_DEBUG(dbgs() << "LV: Consecutive memory access: "; + MemRecipe->dump()); + MemRecipe->makeConsecutive(); + if (auto *R = Ptr->getDefiningRecipe()) + HasScalarUse.insert(R); + continue; + } + + LLVM_DEBUG(dbgs() << "LV: Non-consecutive non-uniform memory access: "; + MemRecipe->dump()); + } + } + } + + // A set of recipes where only the value of lane zero is needed. + SetVector ScalarizeableRecipes; + collectScalarizeableRecipes(HasScalarUse, ScalarizeableRecipes); + + // Replace all the recipes that compute vectors by ones that + // only compute the fist lane. + for (VPRecipeBase *R : ScalarizeableRecipes) { + Instruction *I = R->getUnderlyingInstr(); + + // Handle PHIs: + if (auto *WidenPhi = dyn_cast(R)) { + auto *ScalarPhi = new VPScalarPHIRecipe(cast(I)); + for (unsigned I = 0, E = WidenPhi->getNumOperands(); I != E; I++) + ScalarPhi->addIncoming(WidenPhi->getIncomingValue(I), + WidenPhi->getIncomingBlock(I)); + ReplaceRecipe(R, ScalarPhi); + continue; + } + + // All other widening recipes can be replaced by VPReplicateRecipe + // instances that are marked as uniform. + assert(isa(R) || isa(R) || + isa(R) || isa(R)); + ReplaceRecipe(R, new VPReplicateRecipe(I, R->operands(), true)); + } +} + static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); bool Changed = false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -359,6 +359,7 @@ VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, + VPScalarPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, VPReductionPHISC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll @@ -16,35 +16,42 @@ ; } ; -; CHECK-LABEL: @foo_i32( -; CHECK-LABEL: vector.ph: -; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> poison, <4 x i32> zeroinitializer - -; CHECK-LABEL: vector.body: -; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] -; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> 
%[[VecIndTr2]], %[[Splat]] -; CHECK: br label %[[InnerLoop:.+]] - -; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 -; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; CHECK: [[ForInc]]: -; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4 -; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], -; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 -; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body +; CHECK-LABEL: define void @foo_i32 +; CHECK-SAME: (i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %vector.ph + +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body + +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] + +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, %vector.body ], [ [[TMP8:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 8 +; CHECK-NEXT: br i1 [[TMP9]], label %[[FOR_INC]], label %[[FOR_INNER]] + +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %vector.body @arr2 = external global [8 x i32], align 16 @arr = external global [8 x [8 x i32]], align 16 @@ -83,33 +90,40 @@ ret void } -; CHECK-LABEL: @foo_i64( -; CHECK-LABEL: vector.ph: -; CHECK: %[[SplatVal:.*]] = insertelement <2 x i64> poison, i64 %n, i64 0 -; CHECK: %[[Splat:.*]] = shufflevector <2 x i64> %[[SplatVal]], <2 x i64> poison, <2 x i32> zeroinitializer - -; CHECK-LABEL: vector.body: -; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; CHECK: %[[VecInd:.*]] = phi <2 x i64> [ , %vector.ph ], [ 
%[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i64], ptr @arrX, i64 0, <2 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> %[[VecInd]], <2 x ptr> %[[AAddr]], i32 4, <2 x i1> ) -; CHECK: %[[StoreVal:.*]] = add nsw <2 x i64> %[[VecInd]], %[[Splat]] -; CHECK: br label %[[InnerLoop:.+]] - -; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], ptr @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> %[[StoreVal]], <2 x ptr> %[[AAddr2]], i32 4, <2 x i1> -; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <2 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <2 x i1> %[[VecCond]], i32 0 -; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; CHECK: [[ForInc]]: -; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 2 -; CHECK: %[[VecIndNext]] = add <2 x i64> %[[VecInd]], -; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 -; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body +; CHECK-LABEL: define void @foo_i64 +; CHECK-SAME: (i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label %vector.ph + +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body + +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i64], ptr @arrX, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] + +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, %vector.body ], [ [[TMP6:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [8 x [8 x i64]], ptr @arrY, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], 8 +; CHECK-NEXT: br i1 [[TMP7]], label %[[FOR_INC]], label %[[FOR_INNER]] + +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %vector.body ; Function Attrs: norecurse nounwind uwtable define void @foo_i64(i64 %n) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll --- 
a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -mtriple x86_64 < %s | FileCheck %s ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx < %s | FileCheck %s --check-prefix=AVX ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX @@ -18,68 +19,130 @@ ; } ; -; CHECK-LABEL: vector.ph: -; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> poison, <4 x i32> zeroinitializer - -; CHECK-LABEL: vector.body: -; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] -; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] -; CHECK: br label %[[InnerLoop:.+]] - -; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 -; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; CHECK: [[ForInc]]: -; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4 -; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], -; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 -; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body - -; AVX-LABEL: vector.ph: -; AVX: %[[SplatVal:.*]] = insertelement <8 x i32> poison, i32 %n, i64 0 -; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> poison, <8 x i32> zeroinitializer - -; AVX-LABEL: vector.body: -; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] -; AVX: %[[VecInd:.*]] = phi <8 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <8 x i64> %[[VecInd]] -; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> -; AVX: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %[[VecIndTr]], <8 x ptr> %[[AAddr]], i32 4, <8 x i1> ) -; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> -; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]] -; AVX: br label %[[InnerLoop:.+]] - -; AVX: [[InnerLoop]]: -; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x 
i64> %[[VecInd]] -; AVX: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %[[StoreVal]], <8 x ptr> %[[AAddr2]], i32 4, <8 x i1> %[[InnerPhi]], -; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], -; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0 -; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] - -; AVX: [[ForInc]]: -; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], -; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8 -; AVX: br i1 true, label %middle.block, label %vector.body - @arr2 = external global [8 x i32], align 16 @arr = external global [8 x [8 x i32]], align 16 ; Function Attrs: norecurse nounwind uwtable define void @foo(i32 %n) { +; CHECK-LABEL: define void @foo +; CHECK-SAME: (i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC82:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC82]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br label [[FOR_BODY31:%.*]] +; CHECK: for.body31: +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP8:%.*]], [[FOR_BODY31]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 8 +; CHECK-NEXT: br i1 [[TMP9]], label [[FOR_INC82]], label [[FOR_BODY31]] +; CHECK: for.inc82: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[INDVARS_IV21]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV21]] to i32 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = 
trunc i64 [[INDVARS_IV21]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[N]] +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]] +; CHECK: for.inc8: +; CHECK-NEXT: [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1 +; CHECK-NEXT: [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], 8 +; CHECK-NEXT: br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end10: +; CHECK-NEXT: ret void +; +; AVX-LABEL: define void @foo +; AVX-SAME: (i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: entry: +; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX: vector.ph: +; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[N]], i64 0 +; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC82:%.*]] ] +; AVX-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC82]] ] +; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[TMP0]] +; AVX-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[VEC_IND]] to <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AVX-NEXT: store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; AVX-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[VEC_IND]] to <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; AVX-NEXT: br label [[FOR_BODY31:%.*]] +; AVX: for.body31: +; AVX-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP8:%.*]], [[FOR_BODY31]] ] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[SCALAR_PHI]], i64 [[TMP0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; AVX-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; AVX-NEXT: [[TMP8]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; AVX-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 8 +; AVX-NEXT: br i1 [[TMP9]], label [[FOR_INC82]], label [[FOR_BODY31]] +; AVX: for.inc82: +; AVX-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], +; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; AVX-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX: middle.block: +; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 +; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]] +; AVX: scalar.ph: +; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 
[[INDVARS_IV21]] +; AVX-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV21]] to i32 +; AVX-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4 +; AVX-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV21]] to i32 +; AVX-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[N]] +; AVX-NEXT: br label [[FOR_BODY3:%.*]] +; AVX: for.body3: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] +; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]] +; AVX-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4 +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]] +; AVX: for.inc8: +; AVX-NEXT: [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1 +; AVX-NEXT: [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], 8 +; AVX-NEXT: br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX: for.end10: +; AVX-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll --- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll @@ -117,13 +117,11 @@ } ; Case 3: Annotated outer loop WITH vector width and interleave information -; doesn't have to be collected. +; has to be collected. ; CHECK-LABEL: case3 -; CHECK-NOT: LV: Loop hints: force=enabled -; CHECK-NOT: LV: We can vectorize this outer loop! -; CHECK: LV: Loop hints: force=? -; CHECK: LV: Found a loop: inner.body +; CHECK: LV: Loop hints: force=enabled width=4 interleave=2 +; CHECK: LV: We can vectorize this outer loop! 
define void @case3(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { entry: diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll --- a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll @@ -15,38 +15,33 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_1_LATCH5:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_1_LATCH5]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_1_LATCH4:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_1_LATCH4]] ] ; CHECK-NEXT: br label [[LOOP_2_HEADER1:%.*]] ; CHECK: loop.2.header1: -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[LOOP_2_LATCH4:%.*]] ] +; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[LOOP_2_LATCH3:%.*]] ] ; CHECK-NEXT: br label [[LOOP_32:%.*]] ; CHECK: loop.32: -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], ptr [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], ptr [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], ; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> ) -; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI3]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_2_LATCH4]], label [[LOOP_32]] -; CHECK: loop.2.latch4: -; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_1_LATCH5]], label [[LOOP_2_HEADER1]] -; CHECK: loop.1.latch5: -; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], [[N]] +; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_2_LATCH3]], label [[LOOP_32]] +; CHECK: 
loop.2.latch3: +; CHECK-NEXT: [[TMP5]] = add nuw nsw i64 [[SCALAR_PHI]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], [[N]] +; CHECK-NEXT: br i1 [[TMP6]], label [[LOOP_1_LATCH4]], label [[LOOP_2_HEADER1]] +; CHECK: loop.1.latch4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -75,7 +70,7 @@ ; CHECK: loop.1.latch: ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll @@ -21,20 +21,22 @@ ; CHECK-LABEL: vector.body: ; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] ; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] +; CHECK: %[[IndAdd:.*]] = add i64 %[[Ind]], 0 +; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 %[[IndAdd]] ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[AAddrCpy:.*]] = getelementptr inbounds i32, ptr %[[AAddr]], i32 0 +; CHECK: store <4 x i32> %[[VecIndTr]], ptr %[[AAddrCpy]], align 4 ; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] ; CHECK: br label %[[InnerLoop:.+]] ; CHECK: [[InnerLoop]]: -; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] -; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], -; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], -; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 +; CHECK: %[[InnerPhi:.*]] = phi i64 [ 0, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ] +; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 %[[InnerPhi]], i64 %[[IndAdd]] +; CHECK: %[[AAddr2Cpy:.*]] = getelementptr inbounds i32, ptr %[[AAddr2]], i32 0 +; CHECK: store <4 x i32> %[[StoreVal]], ptr %[[AAddr2Cpy]], align 4 +; CHECK: %[[InnerPhiNext]] = add nuw nsw i64 %[[InnerPhi]], 1 +; CHECK: %[[InnerCond:.*]] = icmp eq i64 %[[InnerPhiNext]], 8 ; CHECK: br i1 
%[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] ; CHECK: [[ForInc]]: diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll @@ -22,30 +22,36 @@ ; CHECK-LABEL: vector.body: ; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] ; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] -; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @A, i64 0, <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[CSplat]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[tmp1:.*]] = add i64 %[[Ind]], 0 +; CHECK: %[[AAddr:.*]] = getelementptr [1024 x i32], ptr @A, i64 0, i64 %[[tmp1]] +; CHECK: %[[tmp2:.*]] = getelementptr i32, ptr %[[AAddr]], i32 0 +; CHECK: store <4 x i32> %[[CSplat]], ptr %[[tmp2]], align 4 ; CHECK: br i1 %[[ZeroTripChk]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]] ; CHECK: [[InnerForPh]]: -; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %[[AAddr]], i32 4, <4 x i1> , <4 x i32> poison) +; CHECK: %[[tmp4:.*]] = getelementptr i32, ptr %[[AAddr]], i32 0 +; CHECK: %[[WideAVal:.*]] = load <4 x i32>, ptr %[[tmp4]], align 4 ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: br label %[[InnerForBody:.*]] ; CHECK: [[InnerForBody]]: -; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ zeroinitializer, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ] +; CHECK: %[[InnerInd:.*]] = phi i64 [ 0, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ] ; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[WideAVal]], %[[InnerForPh]] ], [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ] -; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @B, i64 0, <4 x i64> %[[InnerInd]] -; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %[[BAddr]], i32 4, <4 x i1> , <4 x i32> poison) +; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %[[InnerInd]] +; CHECK: %[[tmp3:.*]] = load i32, ptr %[[BAddr]], align 4 +; CHECK: %[[tmp4:.*]] = insertelement <4 x i32> poison, i32 %[[tmp3]], i64 0 +; CHECK: %[[WideBVal:.*]] = shufflevector <4 x i32> %[[tmp4]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]] ; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]] -; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], -; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}} -; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0 + +; CHECK: %[[InnerIndNext]] = add nuw nsw i64 %[[InnerInd]], 1 +; CHECK: %[[InnerCond:.*]] = icmp eq i64 %[[InnerIndNext]], {{.*}} ; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]] ; CHECK: [[InnerCrit]]: ; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StorePhi]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[tmp5:.*]] = getelementptr i32, ptr %[[AAddr]], i32 0 +; CHECK: store <4 x i32> %[[StorePhi]], ptr %[[tmp5]], align 4 ; CHECK: br label %[[ForInc]] ; CHECK: [[ForInc]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll 
b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll @@ -20,30 +20,29 @@ ; CHECK: vector.body: ; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ] -; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ , %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ] -; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[A_PTR]], i32 8, <4 x i1> , <4 x double> poison) -; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[B_PTR]], i32 8, <4 x i1> , <4 x double> poison) +; CHECK-NEXT: %[[TMP1:.*]] = add i64 %[[FOR1_INDEX]], 0 +; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, i64 %[[TMP1]] +; CHECK-NEXT: %[[TMP2:.*]] = getelementptr inbounds double, ptr %[[A_PTR]], i32 0 +; CHECK-NEXT: %[[WIDE_LOAD1:.*]] = load <4 x double>, ptr %[[TMP2]], align 8 +; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, i64 %[[TMP1]] +; CHECK-NEXT: %[[TMP3:.*]] = getelementptr inbounds double, ptr %[[B_PTR]], i32 0 +; CHECK-NEXT: %[[WIDE_LOAD2:.*]] = load <4 x double>, ptr %[[TMP3]], align 8 ; CHECK-NEXT: br label %[[FOR2_HEADER:.*]] ; CHECK: [[FOR2_HEADER]]: -; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ] -; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ] -; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]] -; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], -; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], -; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0 +; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi i32 [ 0, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ] +; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[WIDE_LOAD1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ] +; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[WIDE_LOAD2]], %[[REDUCTION]] +; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw i32 %[[FOR2_INDEX]], 1 +; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i32 %[[FOR2_INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}} ; CHECK: [[FOR1_LATCH]]: ; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ] -; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> %[[C_PTR]], i32 8, <4 x i1> ) -; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], -; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], +; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, i64 %[[TMP1]] +; CHECK-NEXT: %[[TMP4:.*]] = getelementptr inbounds double, ptr %[[C_PTR]], i32 0 +; CHECK-NEXT: store <4 x double> %[[REDUCTION]], ptr %[[TMP4]], align 8 ; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4 
-; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]],
 ; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
@@ -7,35 +7,35 @@
 define void @widen_call_instruction(ptr noalias nocapture readonly %a.in, ptr noalias nocapture readonly %b.in, ptr noalias nocapture %c.out) {
 ; CHECK-LABEL: @widen_call_instruction(
-; CHECK: vector.body:
-; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
-; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ , %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
-; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, <4 x i64> %[[VEC_INDEX]]
-; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[A_PTR]], i32 8, <4 x i1> , <4 x double> poison)
-; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, <4 x i64> %[[VEC_INDEX]]
-; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[B_PTR]], i32 8, <4 x i1> , <4 x double> poison)
-; CHECK-NEXT: %[[B_SQRT:.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %[[MASKED_GATHER2]])
-; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
-; CHECK: [[FOR2_HEADER]]:
-; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
-; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
-; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[B_SQRT]], %[[REDUCTION]]
-; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]],
-; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]],
-; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
-; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[FOR1_LATCH:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr %a.in, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr %b.in, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[WIDE_LOAD1]])
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
-; CHECK: [[FOR1_LATCH]]:
-; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
-; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> %[[C_PTR]], i32 8, <4 x i1> )
-; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]],
-; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]],
-; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
-; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]],
-; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i32 [ 0, %vector.body ], [ [[TMP7:%.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[WIDE_LOAD]], %vector.body ], [ [[TMP6:%.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: [[TMP6]] = fadd <4 x double> [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP7]] = add nuw nsw i32 [[SCALAR_PHI]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 10000
+; CHECK-NEXT: br i1 [[TMP8]], label %[[FOR1_LATCH]], label %[[FOR2_HEADER]]
+
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x double> [ [[TMP6]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr %c.out, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i32 0
+; CHECK-NEXT: store <4 x double> [[VEC_PHI4]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label %vector.body
 entry:
   br label %for1.header
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
@@ -10,24 +10,26 @@
 ; variables.
 define void @loop_invariant_select(ptr noalias nocapture %out, i1 %select, double %a, double %b) {
-; CHECK-LABEL: @loop_invariant_select(
+; CHECK-LABEL: define void @loop_invariant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], i1 [[SELECT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP2:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SELECT:%.*]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP1]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[SELECT]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[TMP3]], align 8
 entry:
   br label %for1.header
@@ -55,25 +57,28 @@
 }
 define void @outer_loop_dependant_select(ptr noalias nocapture %out, double %a, double %b) {
-; CHECK-LABEL: @outer_loop_dependant_select(
+; CHECK-LABEL: define void @outer_loop_dependant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[SCALAR_PHI:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP4]], align 8
 entry:
   br label %for1.header
@@ -102,25 +107,27 @@
 }
 define void @inner_loop_dependant_select(ptr noalias nocapture %out, double %a, double %b) {
-; CHECK-LABEL: @inner_loop_dependant_select(
+; CHECK-LABEL: define void @inner_loop_dependant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP4]], align 8
 entry:
   br label %for1.header
@@ -149,26 +156,29 @@
 }
 define void @outer_and_inner_loop_dependant_select(ptr noalias nocapture %out, double %a, double %b) {
-; CHECK-LABEL: @outer_and_inner_loop_dependant_select(
+; CHECK-LABEL: define void @outer_and_inner_loop_dependant_select
+; CHECK-SAME: (ptr noalias nocapture [[OUT:%.*]], double [[A:%.*]], double [[B:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[B]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR2_HEADER1:%.*]]
 ; CHECK: for2.header1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR2_HEADER1]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i1>
-; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP3]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> )
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR2_HEADER1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <4 x double> [[TMP4]], ptr [[TMP5]], align 8
 entry:
   br label %for1.header