diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -160,6 +160,8 @@
   bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
 };
+enum class PredicationStyle { None, Data, DataAndControlFlow };
+
 class TargetTransformInfo;
 typedef TargetTransformInfo TTI;
@@ -531,8 +533,12 @@
                                    const LoopAccessInfo *LAI) const;
   /// Query the target whether lowering of the llvm.get.active.lane.mask
-  /// intrinsic is supported.
-  bool emitGetActiveLaneMask() const;
+  /// intrinsic is supported and how the mask should be used. A return value
+  /// of PredicationStyle::Data indicates the mask is used as data only,
+  /// whereas PredicationStyle::DataAndControlFlow indicates we should also use
+  /// the mask for control flow in the loop. If unsupported the return value is
+  /// PredicationStyle::None.
+  PredicationStyle emitGetActiveLaneMask() const;
   // Parameters that control the loop peeling transformation
   struct PeelingPreferences {
@@ -1553,7 +1559,7 @@
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
-  virtual bool emitGetActiveLaneMask() = 0;
+  virtual PredicationStyle emitGetActiveLaneMask() = 0;
   virtual Optional instCombineIntrinsic(InstCombiner &IC,
                                         IntrinsicInst &II) = 0;
   virtual Optional
@@ -1932,7 +1938,7 @@
                                      const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
-  bool emitGetActiveLaneMask() override {
+  PredicationStyle emitGetActiveLaneMask() override {
     return Impl.emitGetActiveLaneMask();
   }
   Optional instCombineIntrinsic(InstCombiner &IC,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -167,8 +167,8 @@
     return false;
   }
-  bool emitGetActiveLaneMask() const {
-    return false;
+  PredicationStyle emitGetActiveLaneMask() const {
+    return PredicationStyle::None;
   }
   Optional instCombineIntrinsic(InstCombiner &IC,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -607,7 +607,7 @@
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
-  bool emitGetActiveLaneMask() {
+  PredicationStyle emitGetActiveLaneMask() {
     return BaseT::emitGetActiveLaneMask();
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -298,7 +298,7 @@
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
-bool TargetTransformInfo::emitGetActiveLaneMask() const {
+PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const {
   return TTIImpl->emitGetActiveLaneMask();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -334,8 +334,10 @@
     return 2;
   }
-  bool emitGetActiveLaneMask() const {
-    return ST->hasSVE();
+  PredicationStyle emitGetActiveLaneMask() const {
+    if
(ST->hasSVE())
+      return PredicationStyle::DataAndControlFlow;
+    return PredicationStyle::None;
   }
   bool supportsScalableVectors() const { return ST->hasSVE(); }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -298,7 +298,7 @@
                              TTI::UnrollingPreferences &UP,
                              OptimizationRemarkEmitter *ORE);
-  bool emitGetActiveLaneMask() const;
+  PredicationStyle emitGetActiveLaneMask() const;
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              TTI::PeelingPreferences &PP);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2247,15 +2247,15 @@
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
-bool ARMTTIImpl::emitGetActiveLaneMask() const {
+PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
-    return false;
+    return PredicationStyle::None;
   // Intrinsic @llvm.get.active.lane.mask is supported.
   // It is used in the MVETailPredication pass, which requires the number of
   // elements processed by this vector loop to setup the tail-predicated
   // loop.
-  return true;
+  return PredicationStyle::Data;
 }
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -57,7 +57,10 @@
   bool shouldExpandReduction(const IntrinsicInst *II) const;
   bool supportsScalableVectors() const { return ST->hasVInstructions(); }
-  bool emitGetActiveLaneMask() const { return ST->hasVInstructions(); }
+  PredicationStyle emitGetActiveLaneMask() const {
+    return ST->hasVInstructions() ? PredicationStyle::Data
+                                  : PredicationStyle::None;
+  }
   Optional getMaxVScale() const;
   Optional getVScaleForTuning() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1026,7 +1026,8 @@
       if (isa(CurRec) || isa(CurRec) ||
          isa(CurRec) ||
-          isa(CurRec))
+          isa(CurRec) ||
+          isa(CurRec))
         continue;
       // This recipe contributes to the address computation of a widen
@@ -1511,6 +1512,13 @@
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const { return FoldTailByMasking; }
+  /// Returns true if we're tail-folding and want to use the active lane mask
+  /// for vector loop control flow.
+  bool useActiveLaneMaskForControlFlow() const {
+    return FoldTailByMasking &&
+           TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
+  }
+
   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
@@ -8016,11 +8024,19 @@
   if (!CM.blockNeedsPredicationForAnyReason(BB))
     return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+  assert(CM.foldTailByMasking() && "must fold the tail");
+
+  // If we're using the active lane mask for control flow, then we get the
+  // mask from the active lane mask PHI that is cached in the VPlan.
+ PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); + if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); @@ -8029,7 +8045,7 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (CM.TTI.emitGetActiveLaneMask()) { + if (EmitGetActiveLaneMask != PredicationStyle::None) { VPValue *TC = Plan->getOrCreateTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); @@ -8573,19 +8589,22 @@ } } -// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a -// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a -// BranchOnCount VPInstruction to the latch. +// Add the necessary canonical IV and branch recipes required to control the +// loop. static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW) { + bool HasNUW, + bool UseLaneMaskForLoopControlFlow) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); + // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); + // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar + // IV by VF * UF. auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8595,10 +8614,59 @@ VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - auto *BranchOnCount = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchOnCount); + if (UseLaneMaskForLoopControlFlow) { + // Create the active lane mask instruction in the vplan preheader. + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {StartV}, DL, "index.part.next"); + Preheader->appendRecipe(CanonicalIVIncrementParts); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getOrCreateTripCount(); + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.entry"); + Preheader->appendRecipe(EntryALM); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. 
+ auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + + // Create the active lane mask for the next iteration of the loop. + CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {CanonicalIVIncrement}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.next"); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + // We have to invert the mask here because a true condition means jumping + // to the exit block. + auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); + EB->appendRecipe(NotMask); + + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); + EB->appendRecipe(BranchBack); + } else { + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); + } } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8699,7 +8767,8 @@ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking()); + !CM.foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9014,7 +9083,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true); + true, CM.useActiveLaneMaskForControlFlow()); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -784,6 +784,10 @@ ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, BranchOnCond }; @@ -901,6 +905,8 @@ case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: return true; }; @@ -1129,6 +1135,7 @@ /// Method to support type inquiry through isa, cast, and dyn_cast. 
static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1136,6 +1143,7 @@ } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1865,6 +1873,42 @@ } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +/// remove VPActiveLaneMaskPHIRecipe. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr, StartMask), + DL(DL) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2660,6 +2704,10 @@ return cast(&*EntryVPBB->begin()); } + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there + /// be only one at most. If there isn't one, then return nullptr. 
+ VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void clearLiveOuts() { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -568,6 +568,24 @@ } #endif +VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + return cast(&R); + } + return nullptr; +} + +static bool canSimplifyBranchOnCond(VPInstruction *Term) { + VPInstruction *Not = dyn_cast(Term->getOperand(0)); + if (!Not || Not->getOpcode() != VPInstruction::Not) + return false; + + VPInstruction *ALM = dyn_cast(Not->getOperand(0)); + return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask; +} + void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, @@ -575,11 +593,15 @@ VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); auto *Term = dyn_cast(&ExitingVPBB->back()); - // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when - // preparing to execute the plan for the main vector loop. - if (!IsEpilogueVectorization && Term && - Term->getOpcode() == VPInstruction::BranchOnCount && - isa(TripCountV)) { + // Try to simplify the branch condition if TC <= VF * UF when preparing to + // execute the plan for the main vector loop. We only do this if the + // terminator is: + // 1. BranchOnCount, or + // 2. BranchOnCond where the input is Not(ActiveLaneMask). + if (!IsEpilogueVectorization && Term && isa(TripCountV) && + (Term->getOpcode() == VPInstruction::BranchOnCount || + (Term->getOpcode() == VPInstruction::BranchOnCond && + canSimplifyBranchOnCond(Term)))) { ConstantInt *C = cast(TripCountV); uint64_t TCVal = C->getZExtValue(); if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { @@ -699,7 +721,8 @@ // generated. bool SinglePartNeeded = isa(PhiR) || isa(PhiR) || - cast(PhiR)->isOrdered(); + (isa(PhiR) && + cast(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -274,6 +274,23 @@ State.set(this, Next, Part); break; } + + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *IV = State.get(getOperand(0), VPIteration(0, 0)); + if (Part == 0) { + State.set(this, IV, Part); + break; + } + + // The canonical IV is incremented by the vectorization factor (num of SIMD + // elements) times the unroll part. 
+ Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); + State.set(this, Next, Part); + break; + } case VPInstruction::BranchOnCond: { if (Part != 0) break; @@ -375,6 +392,12 @@ case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; @@ -1069,3 +1092,28 @@ printOperands(O, SlotTracker); } #endif + +// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +// remove VPActiveLaneMaskPHIRecipe. +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *StartMask = State.get(getOperand(0), Part); + PHINode *EntryPart = + State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "ACTIVE-LANE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ // Phi-like VPValues. Need to be kept together. VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -358,6 +359,7 @@ // Phi-like recipes. Need to be kept together. VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -142,8 +142,17 @@ // other recipes in between. 
auto RecipeI = VPBB->begin(); auto End = VPBB->end(); - while (RecipeI != End && RecipeI->isPhi()) + unsigned NumActiveLaneMaskPhiRecipes = 0; + while (RecipeI != End && RecipeI->isPhi()) { + if (isa(RecipeI)) + NumActiveLaneMaskPhiRecipes++; RecipeI++; + } + + if (NumActiveLaneMaskPhiRecipes > 1) { + errs() << "There should be no more than one VPActiveLaneMaskPHIRecipe"; + return false; + } while (RecipeI != End) { if (RecipeI->isPhi() && !isa(&*RecipeI)) { @@ -181,15 +190,16 @@ } if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount " - "VPInstruction but is empty\n"; + errs() << "VPlan vector loop exiting block must end with BranchOnCount or " + "BranchOnCond VPInstruction but is empty\n"; return false; } auto *LastInst = dyn_cast(std::prev(Exiting->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; return false; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll @@ -3,12 +3,15 @@ define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) { ; CHECK-LABEL: @invariant_store_red_exit_is_phi( +; CHECK: vector.ph: +; CHECK: %[[ACTIVE_LANE_MASK_ENTRY:.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n) ; CHECK: vector.body: +; CHECK: %[[ACTIVE_LANE_MASK:.*]] = phi [ %[[ACTIVE_LANE_MASK_ENTRY]], %vector.ph ], [ %[[ACTIVE_LANE_MASK_NEXT:.*]], %vector.body ] ; CHECK: %[[VEC_PHI:.*]] = phi [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ] -; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n) ; CHECK: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32 ; CHECK-NEXT: %[[ADD:.*]] = add %[[VEC_PHI]], %[[LOAD]] ; CHECK-NEXT: %[[SELECT:.*]] = select %[[ACTIVE_LANE_MASK]], %[[ADD]], %[[VEC_PHI]] +; CHECK: %[[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %n) ; CHECK: middle.block: ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[SELECT]]) ; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -6,14 +6,16 @@ ; CHECK-LABEL: @trip7_i64( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 7) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void 
@llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[COND:%.*]] = extractelement [[ACTIVE_LANE_MASK_NOT]], i32 0 ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body ; entry: @@ -40,13 +42,15 @@ ; CHECK-LABEL: @trip5_i8( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 {{%.*}}, i64 5) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8( {{%.*}}, * {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 5) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) ; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -1,10 +1,34 @@ -; RUN: opt -S -loop-vectorize < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize < %s 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=VPLANS ; These tests ensure that tail-folding is enabled when the predicate.enable ; loop attribute is set to true. 
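; For orientation, a minimal illustrative sketch (hand-written, not generated
; output; the value names are invented) of the loop-latch pattern these checks
; verify when the DataAndControlFlow predication style is in use: the next
; iteration's lane mask is computed in the latch, inverted, and lane 0 of the
; inverted mask drives the exit branch, assuming a <vscale x 4 x i1> mask:
;
;   %active.lane.mask.next = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %n)
;   %not.alm = xor <vscale x 4 x i1> %active.lane.mask.next, shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
;   %exit.cond = extractelement <vscale x 4 x i1> %not.alm, i32 0
;   br i1 %exit.cond, label %middle.block, label %vector.body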
target triple = "aarch64-unknown-linux-gnu" +; VPLANS-LABEL: Checking a loop in 'simple_memset' +; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS-NEXT: vector.ph: +; VPLANS-NEXT: EMIT vp<%2> = VF * Part + ir<0> +; VPLANS-NEXT: EMIT vp<%3> = active lane mask vp<%2> +; VPLANS-NEXT: Successor(s): vector loop +; VPLANS-EMPTY: +; VPLANS-NEXT: vector loop: { +; VPLANS-NEXT: vector.body: +; VPLANS-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION +; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<%5> = phi vp<%3>, vp<%10> +; VPLANS-NEXT: vp<%6> = SCALAR-STEPS vp<%4>, ir<0>, ir<1> +; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%6> +; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<%5> +; VPLANS-NEXT: EMIT vp<%8> = VF * UF + vp<%4> +; VPLANS-NEXT: EMIT vp<%9> = VF * Part + vp<%8> +; VPLANS-NEXT: EMIT vp<%10> = active lane mask vp<%9> +; VPLANS-NEXT: EMIT vp<%11> = not vp<%10> +; VPLANS-NEXT: EMIT branch-on-cond vp<%11> +; VPLANS-NEXT: No successors +; VPLANS-NEXT: } define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 { ; CHECK-LABEL: @simple_memset( @@ -24,22 +48,25 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label 
[[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -4,17 +4,47 @@ define void @trip1024_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip1024_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 -1025, [[TMP1]] +; CHECK-NEXT: br i1 [[TMP2]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 1024) -; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} -; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0nxv2i64(* [[TMP11]], i32 8, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = shl nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv2i64.p0nxv2i64(* [[TMP15]], i32 8, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_LOAD2]], [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP14]] to * +; 
CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0nxv2i64( [[TMP16]], * [[TMP17]], i32 8, [[ACTIVE_LANE_MASK1]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK3]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1024) +; CHECK-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK3]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -22,64 +22,94 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector [[BROADCAST_SPLATINSERT13]], poison, zeroinitializer +; 
CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector [[BROADCAST_SPLATINSERT15]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX1]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX1]], [[TMP23]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP19]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP24]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT6]], * [[TMP34]], i32 4, [[ACTIVE_LANE_MASK2]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT8]], * [[TMP38]], i32 4, [[ACTIVE_LANE_MASK3]]) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 12 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT10]], * [[TMP42]], i32 4, [[ACTIVE_LANE_MASK4]]) -; CHECK-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP44:%.*]] = mul i64 [[TMP43]], 16 -; CHECK-NEXT: 
[[INDEX_NEXT11]] = add i64 [[INDEX1]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX6]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP36]], i32 4, [[ACTIVE_LANE_MASK7]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT12]], * [[TMP40]], i32 4, [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT14]], * [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 12 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT16]], * [[TMP48]], i32 4, [[ACTIVE_LANE_MASK10]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: 
[[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-NEXT: [[INDEX_NEXT17]] = add i64 [[INDEX6]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT19:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT20:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT21:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP56]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT17]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK23]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT19]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK24]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT20]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT21]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP57:%.*]] = xor [[ACTIVE_LANE_MASK22]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP58:%.*]] = xor [[ACTIVE_LANE_MASK23]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP59:%.*]] = xor [[ACTIVE_LANE_MASK24]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP60:%.*]] = xor [[ACTIVE_LANE_MASK25]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP61:%.*]] = extractelement [[TMP57]], i32 0 +; CHECK-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -116,94 +146,124 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector [[BROADCAST_SPLATINSERT12]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector [[BROADCAST_SPLATINSERT16]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector [[BROADCAST_SPLATINSERT18]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX1]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX1]], [[TMP23]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP19]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP24]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP34]], i32 4, [[ACTIVE_LANE_MASK2]], 
poison) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP38]], i32 4, [[ACTIVE_LANE_MASK3]], poison) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 12 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP42]], i32 4, [[ACTIVE_LANE_MASK4]], poison) -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne [[WIDE_MASKED_LOAD6]], zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne [[WIDE_MASKED_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP51:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP43]], zeroinitializer -; CHECK-NEXT: [[TMP52:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP44]], zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP54:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP46]], zeroinitializer -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP56]], i32 4, [[TMP51]]) -; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP58:%.*]] = mul i32 [[TMP57]], 4 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT9]], * [[TMP60]], i32 4, [[TMP52]]) -; CHECK-NEXT: [[TMP61:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], 8 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT11]], * [[TMP64]], i32 4, [[TMP53]]) -; CHECK-NEXT: [[TMP65:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], 12 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP66]] -; CHECK-NEXT: [[TMP68:%.*]] = bitcast i32* [[TMP67]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT13]], * [[TMP68]], i32 4, [[TMP54]]) -; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX1]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP71]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT20:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK26:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK27:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX6]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP36]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP40]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 12 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP48]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-NEXT: [[TMP49:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = icmp ne [[WIDE_MASKED_LOAD13]], zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = 
getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP57:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP49]], zeroinitializer +; CHECK-NEXT: [[TMP58:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP50]], zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP51]], zeroinitializer +; CHECK-NEXT: [[TMP60:%.*]] = select [[ACTIVE_LANE_MASK10]], [[TMP52]], zeroinitializer +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP53]], i32 0 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP62]], i32 4, [[TMP57]]) +; CHECK-NEXT: [[TMP63:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP64:%.*]] = mul i32 [[TMP63]], 4 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[TMP65]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT15]], * [[TMP66]], i32 4, [[TMP58]]) +; CHECK-NEXT: [[TMP67:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], 8 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT17]], * [[TMP70]], i32 4, [[TMP59]]) +; CHECK-NEXT: [[TMP71:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], 12 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT19]], * [[TMP74]], i32 4, [[TMP60]]) +; CHECK-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 16 +; CHECK-NEXT: [[INDEX_NEXT20]] = add i64 [[INDEX6]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT22:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT23:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP81]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT24:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP82]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT20]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK26]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT22]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK27]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT23]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK28]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT24]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK25]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP84:%.*]] = xor [[ACTIVE_LANE_MASK26]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP85:%.*]] = xor 
[[ACTIVE_LANE_MASK27]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP86:%.*]] = xor [[ACTIVE_LANE_MASK28]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP87:%.*]] = extractelement [[TMP83]], i32 0 +; CHECK-NEXT: br i1 [[TMP87]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -1,8 +1,6 @@ ; RUN: opt -S -hints-allow-reordering=false -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -prefer-inloop-reductions < %s | FileCheck %s ; RUN: opt -S -hints-allow-reordering=false -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-inloop-reductions < %s | FileCheck %s -; CHECK-NOT: vector.body: - target triple = "aarch64-unknown-linux-gnu" @@ -24,22 +22,25 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP16]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -59,6 +60,51 @@ } +define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 { +; CHECK-LABEL: @simple_memset_v4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK4]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %while.body + +while.body: ; preds = %while.body, %entry + %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ] + %gep = getelementptr i32, i32* %ptr, i64 %index + store i32 %val, i32* %gep + %index.next = add nsw i64 %index, 1 + %cmp10 = icmp ult i64 %index.next, %n + br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !3 + +while.end.loopexit: ; preds = %while.body + ret void +} + + define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 { ; CHECK-LABEL: @simple_memcpy( ; CHECK-NEXT: entry: @@ -77,24 +123,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ 
[[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -138,6 +187,7 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 4, i32 0), poison, zeroinitializer) @@ -149,25 +199,21 @@ ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX1]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = call 
@llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP19:%.*]] = add zeroinitializer, [[TMP18]] -; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP20]], i64 [[TMP2]]) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP21]], i32 4, [[ACTIVE_LANE_MASK]], undef) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP22]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP18]], i32 4, [[ACTIVE_LANE_MASK2]], undef) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP19]], i32 4, [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP21]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -207,24 +253,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], 
poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK2]], undef) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -267,23 +316,26 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK2]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -326,22 +378,23 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[SRC:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], undef) -; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP13]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] @@ -350,9 +403,11 @@ ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[PREDPHI]], * [[TMP20]], i32 4, [[TMP18]]) ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP22]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[N]]) +; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -403,23 +458,26 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[DST:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK2]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -459,27 +517,30 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 
0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]] ; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK5]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT4]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK5]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -521,24 +582,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK2]], [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK4]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -577,23 +641,26 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: 
[[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
 ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
@@ -631,30 +698,33 @@
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]])
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP14]], i32 0
 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP18]])
 ; CHECK-NEXT: [[TMP20]] = xor i32 [[TMP19]], [[VEC_PHI]]
 ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK3]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK3]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ;
@@ -715,5 +785,7 @@
 !0 = distinct !{!0, !1, !2}
 !1 = !{!"llvm.loop.vectorize.width", i32 4}
 !2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = distinct !{!3, !4}
+!4 = !{!"llvm.loop.vectorize.width", i32 4}
 
 attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -9,10 +9,12 @@
 ; we don't artificially create new predicated blocks for the load.
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
+; CHECK: vector.ph:
+; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n)
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ]
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
-; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
 ; CHECK-NEXT: [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4
 ; CHECK-NOT: load i32, i32* %src, align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0
@@ -20,10 +22,12 @@
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
 ; CHECK-NEXT: [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec
-; CHECK-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX_NEXT]], i64 %n)
+; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body
 entry:
   br label %for.body
@@ -48,16 +52,17 @@
 define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK: vector.ph:
+; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n)
 ; CHECK: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0
 ; CHECK-NEXT: [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ]
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
-; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
-; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison)
+; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer
 ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
 ; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef)
 entry:
   br label %for.body