Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -159,6 +159,8 @@ bool skipScalarizationCost() const { return ScalarizationCost.isValid(); } }; +enum class PredicationStyle { None, Data, DataAndControlFlow }; + class TargetTransformInfo; typedef TargetTransformInfo TTI; @@ -531,7 +533,7 @@ /// Query the target whether lowering of the llvm.get.active.lane.mask /// intrinsic is supported. - bool emitGetActiveLaneMask() const; + PredicationStyle emitGetActiveLaneMask() const; // Parameters that control the loop peeling transformation struct PeelingPreferences { @@ -1539,7 +1541,7 @@ preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) = 0; - virtual bool emitGetActiveLaneMask() = 0; + virtual PredicationStyle emitGetActiveLaneMask() = 0; virtual Optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) = 0; virtual Optional @@ -1914,7 +1916,7 @@ const LoopAccessInfo *LAI) override { return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } - bool emitGetActiveLaneMask() override { + PredicationStyle emitGetActiveLaneMask() override { return Impl.emitGetActiveLaneMask(); } Optional instCombineIntrinsic(InstCombiner &IC, Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -167,8 +167,8 @@ return false; } - bool emitGetActiveLaneMask() const { - return false; + PredicationStyle emitGetActiveLaneMask() const { + return PredicationStyle::None; } Optional instCombineIntrinsic(InstCombiner &IC, Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -607,7 +607,7 @@ return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } - bool emitGetActiveLaneMask() { + PredicationStyle emitGetActiveLaneMask() { return BaseT::emitGetActiveLaneMask(); } Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -293,7 +293,7 @@ return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } -bool TargetTransformInfo::emitGetActiveLaneMask() const { +PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const { return TTIImpl->emitGetActiveLaneMask(); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -332,8 +332,10 @@ return 2; } - bool emitGetActiveLaneMask() const { - return ST->hasSVE(); + PredicationStyle emitGetActiveLaneMask() const { + if (ST->hasSVE()) + return PredicationStyle::DataAndControlFlow; + return PredicationStyle::None; } bool supportsScalableVectors() const { return ST->hasSVE(); } Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -298,7 +298,7 @@ TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); - bool emitGetActiveLaneMask() const; + PredicationStyle emitGetActiveLaneMask() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2247,15 +2247,15 @@ return canTailPredicateLoop(L, LI, SE, DL, LAI); } -bool ARMTTIImpl::emitGetActiveLaneMask() const { +PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { if (!ST->hasMVEIntegerOps() || !EnableTailPredication) - return false; + return PredicationStyle::None; // Intrinsic @llvm.get.active.lane.mask is supported. // It is used in the MVETailPredication pass, which requires the number of // elements processed by this vector loop to setup the tail-predicated // loop. - return true; + return PredicationStyle::Data; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1090,7 +1090,8 @@ if (isa(CurRec) || isa(CurRec) || isa(CurRec) || - isa(CurRec)) + isa(CurRec) || + isa(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1589,6 +1590,11 @@ /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } + bool useActiveLaneMaskForControlFlow() const { + return FoldTailByMasking && + TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. @@ -8084,11 +8090,19 @@ if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + assert(CM.foldTailByMasking() && "must fold the tail"); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); + if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); @@ -8097,7 +8111,7 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (CM.TTI.emitGetActiveLaneMask()) { + if (EmitGetActiveLaneMask != PredicationStyle::None) { VPValue *TC = Plan->getOrCreateTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); } else { @@ -8652,7 +8666,8 @@ // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a // BranchOnCount VPInstruction to the latch. static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW) { + bool HasNUW, + bool UseLaneMaskForLoopControlFlow) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); @@ -8670,10 +8685,52 @@ VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - auto *BranchOnCount = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchOnCount); + if (UseLaneMaskForLoopControlFlow) { + // Create the active lane mask instruction in the vplan preheader. + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {StartV}, DL); + Preheader->appendRecipe(CanonicalIVIncrementParts); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getOrCreateTripCount(); + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL); + Preheader->appendRecipe(EntryALM); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + Plan.setActiveLaneMaskPhi(LaneMaskPhi); + + CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {CanonicalIVIncrement}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnActiveLaneMask, {ALM}, DL); + + EB->appendRecipe(BranchBack); + } else { + VPInstruction *BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); + } } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8774,7 +8831,8 @@ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking()); + !CM.foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9085,7 +9143,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true); + true, CM.useActiveLaneMaskForControlFlow()); return Plan; } Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -755,8 +755,13 @@ ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, - BranchOnCond + BranchOnCond, + BranchOnActiveLaneMask, }; private: @@ -847,6 +852,7 @@ case Instruction::AtomicRMW: case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: + case VPInstruction::BranchOnActiveLaneMask: return false; default: return true; @@ -868,7 +874,10 @@ case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: + case VPInstruction::BranchOnActiveLaneMask: return true; }; llvm_unreachable("switch should return"); @@ -1096,6 +1105,7 @@ /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1103,6 +1113,7 @@ } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1832,6 +1843,40 @@ } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr, StartMask), + DL(DL) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2439,6 +2484,8 @@ /// the tail. It equals TripCount - 1. VPValue *BackedgeTakenCount = nullptr; + VPActiveLaneMaskPHIRecipe *LaneMask = nullptr; + /// Represents the vector trip count. VPValue VectorTripCount; @@ -2513,6 +2560,12 @@ return BackedgeTakenCount; } + void setActiveLaneMaskPhi(VPActiveLaneMaskPHIRecipe *Mask) { + LaneMask = Mask; + } + + VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi() const { return LaneMask; } + /// The vector trip count. VPValue &getVectorTripCount() { return VectorTripCount; } Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -394,7 +394,8 @@ bool IsCondBranch = isa(R) || (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond || - VPI->getOpcode() == VPInstruction::BranchOnCount)); + VPI->getOpcode() == VPInstruction::BranchOnCount || + VPI->getOpcode() == VPInstruction::BranchOnActiveLaneMask)); (void)IsCondBranch; if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) { @@ -785,17 +786,41 @@ Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); break; } - case VPInstruction::BranchOnCount: { + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *IV = State.get(getOperand(0), VPIteration(0, 0)); + + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(IV, Step, "index.part.next", IsNUW, false); + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCount: + case VPInstruction::BranchOnActiveLaneMask: { if (Part != 0) break; - // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - - // Now create the branch. + // First create the condition. auto *Plan = getParent()->getPlan(); VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + + Value *Cond; + if (getOpcode() == VPInstruction::BranchOnCount) { + Value *IV = State.get(getOperand(0), 0); + Value *TC = State.get(getOperand(1), 0); + Cond = Builder.CreateICmpEQ(IV, TC); + } else { + Value *LaneMask = State.get(getOperand(0), Part); + + // Exit the loop if there are no active lanes. Testing the first lane is + // sufficient for this. + Cond = Builder.CreateExtractElement(LaneMask, Builder.getInt32(0)); + Cond = Builder.CreateNot(Cond); + } + + // Now create the branch. VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); // Replace the temporary unreachable terminator with a new conditional @@ -865,9 +890,18 @@ case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; + case VPInstruction::BranchOnActiveLaneMask: + O << "branch-on-active-lane-mask "; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -905,7 +939,8 @@ // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when // preparing to execute the plan for the main vector loop. if (!CanonicalIVStartValue && Term && - Term->getOpcode() == VPInstruction::BranchOnCount && + (Term->getOpcode() == VPInstruction::BranchOnCount || + Term->getOpcode() == VPInstruction::BranchOnActiveLaneMask) && isa(TripCountV)) { ConstantInt *C = cast(TripCountV); uint64_t TCVal = C->getZExtValue(); @@ -1026,7 +1061,8 @@ // generated. bool SinglePartNeeded = isa(PhiR) || isa(PhiR) || - cast(PhiR)->isOrdered(); + (isa(PhiR) && + cast(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { @@ -1519,6 +1555,28 @@ (IsUniform || !VF.isScalable()); } +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *StartMask = State.get(getOperand(0), Part); + PHINode *EntryPart = + PHINode::Create(StartMask->getType(), 2, "active.lane.mask", + &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = ACTIVE-LANE-MASK-PHI"; +} +#endif + void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.Instance && "cannot be used in per-lane"); const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); Index: llvm/lib/Transforms/Vectorize/VPlanValue.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanValue.h +++ llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ // Phi-like VPValues. Need to be kept together. VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -358,6 +359,7 @@ // Phi-like recipes. Need to be kept together. VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, Index: llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -187,9 +187,11 @@ } auto *LastInst = dyn_cast(std::prev(Exiting->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || + (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnActiveLaneMask)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnActiveLaneMask VPInstruction\n"; return false; } Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll @@ -4,11 +4,12 @@ define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) { ; CHECK-LABEL: @invariant_store_red_exit_is_phi( ; CHECK: vector.body: +; CHECK: %[[ACTIVE_LANE_MASK:.*]] = phi [ %{{.*}}, %vector.ph ], [ %[[ACTIVE_LANE_MASK2:.*]], %vector.body ] ; CHECK: %[[VEC_PHI:.*]] = phi [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ] -; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n) ; CHECK: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32 ; CHECK-NEXT: %[[ADD:.*]] = add %[[VEC_PHI]], %[[LOAD]] ; CHECK-NEXT: %[[SELECT:.*]] = select %[[ACTIVE_LANE_MASK]], %[[ADD]], %[[VEC_PHI]] +; CHECK: %[[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %n) ; CHECK: middle.block: ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[SELECT]]) ; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4 Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -6,14 +6,17 @@ ; CHECK-LABEL: @trip7_i64( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 7) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} +; CHECK-NEXT: [[INDEX_NEXT_PART:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT_PART]], i64 7) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[COND:%.*]] = xor i1 [[FIRST_ACTIVE_LANE]], true ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body ; entry: @@ -40,13 +43,15 @@ ; CHECK-LABEL: @trip5_i8( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 {{%.*}}, i64 5) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8( {{%.*}}, * {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] +; CHECK-NEXT: [[INDEX_NEXT_PART:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT_PART]], i64 5) ; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body ; entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -1,10 +1,32 @@ -; RUN: opt -S -loop-vectorize < %s | FileCheck %s +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize < %s 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=VPLANS ; These tests ensure that tail-folding is enabled when the predicate.enable ; loop attribute is set to true. target triple = "aarch64-unknown-linux-gnu" +; VPLANS-LABEL: Checking a loop in 'simple_memset' +; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS-NEXT: vector.ph: +; VPLANS-NEXT: EMIT vp<%2> = VF * Part + ir<0> +; VPLANS-NEXT: EMIT vp<%3> = active lane mask vp<%2> +; VPLANS-NEXT: Successor(s): vector loop +; VPLANS-EMPTY: +; VPLANS-NEXT: vector loop: { +; VPLANS-NEXT: vector.body: +; VPLANS-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION +; VPLANS-NEXT: EMIT vp<%5> = ACTIVE-LANE-MASK-PHI +; VPLANS-NEXT: vp<%6> = SCALAR-STEPS vp<%4>, ir<0>, ir<1> +; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%6> +; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<%5> +; VPLANS-NEXT: EMIT vp<%8> = VF * UF + vp<%4> +; VPLANS-NEXT: EMIT vp<%9> = VF * Part + vp<%8> +; VPLANS-NEXT: EMIT vp<%10> = active lane mask vp<%9> +; VPLANS-NEXT: EMIT branch-on-active-lane-mask vp<%10> +; VPLANS-NEXT: No successors +; VPLANS-NEXT: } define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 { ; CHECK-LABEL: @simple_memset( @@ -24,22 +46,26 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -6,14 +6,17 @@ ; CHECK-LABEL: @trip1024_i64( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 1024) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT]], i64 1024) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[COND:%.*]] = xor i1 [[FIRST_ACTIVE_LANE]], true ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body ; entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -22,64 +22,92 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector [[BROADCAST_SPLATINSERT13]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector [[BROADCAST_SPLATINSERT15]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX1]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX1]], [[TMP23]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP19]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP24]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT6]], * [[TMP34]], i32 4, [[ACTIVE_LANE_MASK2]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT8]], * [[TMP38]], i32 4, [[ACTIVE_LANE_MASK3]]) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 12 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT10]], * [[TMP42]], i32 4, [[ACTIVE_LANE_MASK4]]) -; CHECK-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP44:%.*]] = mul i64 [[TMP43]], 16 -; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX1]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX6]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP36]], i32 4, [[ACTIVE_LANE_MASK7]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT12]], * [[TMP40]], i32 4, [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT14]], * [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 12 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT16]], * [[TMP48]], i32 4, [[ACTIVE_LANE_MASK10]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-NEXT: [[INDEX_NEXT17]] = add i64 [[INDEX6]], [[TMP50]] +; CHECK-NEXT: [[INDEX_PART_NEXT18:%.*]] = add i64 [[INDEX_NEXT17]], 0 +; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT19:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT20:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT21:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP56]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT18]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK23]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT19]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK24]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT20]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT21]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP57:%.*]] = extractelement [[ACTIVE_LANE_MASK22]], i32 0 +; CHECK-NEXT: [[TMP58:%.*]] = xor i1 [[TMP57]], true +; CHECK-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -116,94 +144,122 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector [[BROADCAST_SPLATINSERT12]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector [[BROADCAST_SPLATINSERT16]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector [[BROADCAST_SPLATINSERT18]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX1]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX1]], [[TMP23]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP19]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP24]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP34]], i32 4, [[ACTIVE_LANE_MASK2]], poison) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP38]], i32 4, [[ACTIVE_LANE_MASK3]], poison) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 12 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP42]], i32 4, [[ACTIVE_LANE_MASK4]], poison) -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne [[WIDE_MASKED_LOAD6]], zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne [[WIDE_MASKED_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP51:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP43]], zeroinitializer -; CHECK-NEXT: [[TMP52:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP44]], zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP54:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP46]], zeroinitializer -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP56]], i32 4, [[TMP51]]) -; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP58:%.*]] = mul i32 [[TMP57]], 4 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT9]], * [[TMP60]], i32 4, [[TMP52]]) -; CHECK-NEXT: [[TMP61:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], 8 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT11]], * [[TMP64]], i32 4, [[TMP53]]) -; CHECK-NEXT: [[TMP65:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], 12 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP66]] -; CHECK-NEXT: [[TMP68:%.*]] = bitcast i32* [[TMP67]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT13]], * [[TMP68]], i32 4, [[TMP54]]) -; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX1]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP71]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK26:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK27:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX6]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP36]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP40]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 12 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP48]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-NEXT: [[TMP49:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = icmp ne [[WIDE_MASKED_LOAD13]], zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP57:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP49]], zeroinitializer +; CHECK-NEXT: [[TMP58:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP50]], zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP51]], zeroinitializer +; CHECK-NEXT: [[TMP60:%.*]] = select [[ACTIVE_LANE_MASK10]], [[TMP52]], zeroinitializer +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP53]], i32 0 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP62]], i32 4, [[TMP57]]) +; CHECK-NEXT: [[TMP63:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP64:%.*]] = mul i32 [[TMP63]], 4 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[TMP65]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT15]], * [[TMP66]], i32 4, [[TMP58]]) +; CHECK-NEXT: [[TMP67:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], 8 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT17]], * [[TMP70]], i32 4, [[TMP59]]) +; CHECK-NEXT: [[TMP71:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], 12 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT19]], * [[TMP74]], i32 4, [[TMP60]]) +; CHECK-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 16 +; CHECK-NEXT: [[INDEX_NEXT20]] = add i64 [[INDEX6]], [[TMP76]] +; CHECK-NEXT: [[INDEX_PART_NEXT21:%.*]] = add i64 [[INDEX_NEXT20]], 0 +; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT22:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT23:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP81]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT24:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP82]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT21]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK26]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT22]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK27]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT23]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK28]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT24]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP83:%.*]] = extractelement [[ACTIVE_LANE_MASK25]], i32 0 +; CHECK-NEXT: [[TMP84:%.*]] = xor i1 [[TMP83]], true +; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -24,22 +24,26 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -59,6 +63,52 @@ } +define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 { +; CHECK-LABEL: @simple_memset_v4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = xor i1 [[TMP4]], true +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %while.body + +while.body: ; preds = %while.body, %entry + %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ] + %gep = getelementptr i32, i32* %ptr, i64 %index + store i32 %val, i32* %gep + %index.next = add nsw i64 %index, 1 + %cmp10 = icmp ult i64 %index.next, %n + br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !3 + +while.end.loopexit: ; preds = %while.body + ret void +} + + define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 { ; CHECK-LABEL: @simple_memcpy( ; CHECK-NEXT: entry: @@ -77,24 +127,28 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -138,6 +192,7 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 4, i32 0), poison, zeroinitializer) @@ -149,25 +204,22 @@ ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX1]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP19:%.*]] = add zeroinitializer, [[TMP18]] -; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP20]], i64 [[TMP2]]) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP21]], i32 4, [[ACTIVE_LANE_MASK]], undef) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP22]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP18]], i32 4, [[ACTIVE_LANE_MASK2]], undef) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP19]], i32 4, [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP21]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = xor i1 [[TMP22]], true +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -207,24 +259,28 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK2]], undef) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -267,23 +323,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK2]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -326,22 +386,23 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[SRC:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], undef) -; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP13]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] @@ -350,9 +411,12 @@ ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[PREDPHI]], * [[TMP20]], i32 4, [[TMP18]]) ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP22]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = xor i1 [[TMP23]], true +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -403,23 +467,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[DST:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK2]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -459,27 +527,31 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]] ; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK5]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK5]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -521,24 +593,28 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK2]], [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -577,23 +653,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK2]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -631,30 +711,34 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK1]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK1]], [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP14]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP17]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP18:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP17]], i32 4, [[TMP15]], poison) +; CHECK-NEXT: [[TMP18:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD2]], zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP18]]) ; CHECK-NEXT: [[TMP20]] = xor i32 [[TMP19]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK3]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK3]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = xor i1 [[TMP23]], true +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -715,5 +799,7 @@ !0 = distinct !{!0, !1, !2} !1 = !{!"llvm.loop.vectorize.width", i32 4} !2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!3 = distinct !{!3, !4} +!4 = !{!"llvm.loop.vectorize.width", i32 4} attributes #0 = { "target-features"="+sve" } Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -9,10 +9,12 @@ ; we don't artificially create new predicated blocks for the load. define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 { ; CHECK-LABEL: @uniform_load( +; CHECK: vector.ph: +; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) ; CHECK: vector.body: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 -; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n) ; CHECK-NEXT: [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4 ; CHECK-NOT: load i32, i32* %src, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0 @@ -20,10 +22,13 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; CHECK-NEXT: [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec -; CHECK-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body +; CHECK-NEXT: [[IDX_NEXT_PART0:%.*]] = add i64 [[IDX_NEXT]], 0 +; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX_NEXT_PART0]], i64 %n) +; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], i32 0 +; CHECK-NEXT: [[FIRST_LANE_NOT_SET:%.*]] = xor i1 [[FIRST_LANE_SET]], true +; CHECK-NEXT: br i1 [[FIRST_LANE_NOT_SET]], label %middle.block, label %vector.body entry: br label %for.body @@ -48,16 +53,17 @@ define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 { ; CHECK-LABEL: @cond_uniform_load( ; CHECK: vector.ph: +; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) ; CHECK: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0 ; CHECK-NEXT: [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; CHECK: vector.body: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 -; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n) -; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison) +; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer ; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef) entry: br label %for.body