Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -159,6 +159,8 @@ bool skipScalarizationCost() const { return ScalarizationCost.isValid(); } }; +enum class ActiveLaneMask { None, Predicate, PredicateAndControlFlow }; + class TargetTransformInfo; typedef TargetTransformInfo TTI; @@ -531,7 +533,7 @@ /// Query the target whether lowering of the llvm.get.active.lane.mask /// intrinsic is supported. - bool emitGetActiveLaneMask() const; + ActiveLaneMask emitGetActiveLaneMask() const; // Parameters that control the loop peeling transformation struct PeelingPreferences { @@ -1539,7 +1541,7 @@ preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) = 0; - virtual bool emitGetActiveLaneMask() = 0; + virtual ActiveLaneMask emitGetActiveLaneMask() = 0; virtual Optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) = 0; virtual Optional @@ -1914,7 +1916,7 @@ const LoopAccessInfo *LAI) override { return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } - bool emitGetActiveLaneMask() override { + ActiveLaneMask emitGetActiveLaneMask() override { return Impl.emitGetActiveLaneMask(); } Optional instCombineIntrinsic(InstCombiner &IC, Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -167,9 +167,7 @@ return false; } - bool emitGetActiveLaneMask() const { - return false; - } + ActiveLaneMask emitGetActiveLaneMask() const { return ActiveLaneMask::None; } Optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -603,7 +603,7 @@ return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } - bool emitGetActiveLaneMask() { + ActiveLaneMask emitGetActiveLaneMask() { return BaseT::emitGetActiveLaneMask(); } Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -293,7 +293,7 @@ return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } -bool TargetTransformInfo::emitGetActiveLaneMask() const { +ActiveLaneMask TargetTransformInfo::emitGetActiveLaneMask() const { return TTIImpl->emitGetActiveLaneMask(); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -332,8 +332,10 @@ return 2; } - bool emitGetActiveLaneMask() const { - return ST->hasSVE(); + ActiveLaneMask emitGetActiveLaneMask() const { + if (ST->hasSVE()) + return ActiveLaneMask::PredicateAndControlFlow; + return ActiveLaneMask::None; } bool supportsScalableVectors() const { return ST->hasSVE(); } Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- 
llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -298,7 +298,7 @@ TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); - bool emitGetActiveLaneMask() const; + ActiveLaneMask emitGetActiveLaneMask() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2246,15 +2246,15 @@ return canTailPredicateLoop(L, LI, SE, DL, LAI); } -bool ARMTTIImpl::emitGetActiveLaneMask() const { +ActiveLaneMask ARMTTIImpl::emitGetActiveLaneMask() const { if (!ST->hasMVEIntegerOps() || !EnableTailPredication) - return false; + return ActiveLaneMask::None; // Intrinsic @llvm.get.active.lane.mask is supported. // It is used in the MVETailPredication pass, which requires the number of // elements processed by this vector loop to setup the tail-predicated // loop. - return true; + return ActiveLaneMask::Predicate; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1098,7 +1098,8 @@ if (isa(CurRec) || isa(CurRec) || isa(CurRec) || - isa(CurRec)) + isa(CurRec) || + isa(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -8089,11 +8090,19 @@ if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + assert(CM.foldTailByMasking() && "must fold the tail"); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + ActiveLaneMask EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); + if (EmitGetActiveLaneMask == ActiveLaneMask::PredicateAndControlFlow) + return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); @@ -8102,7 +8111,7 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (CM.TTI.emitGetActiveLaneMask()) { + if (EmitGetActiveLaneMask != ActiveLaneMask::None) { VPValue *TC = Plan->getOrCreateTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); } else { @@ -8669,7 +8678,8 @@ // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a // BranchOnCount VPInstruction to the latch. 
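// When UseLaneMaskForLoopControlFlow is set, the latch built below ends with
// a BranchOnActiveLaneMask instead of a BranchOnCount. A rough sketch of the
// resulting VPlan, taken from the debug output checked in the new
// sve-tail-folding-forced.ll test (the vp<...> names are illustrative only):
//
//   vector.body:
//     EMIT vp<%iv>        = CANONICAL-INDUCTION
//     EMIT vp<%mask>      = ACTIVE-LANE-MASK-PHI
//     ...
//     EMIT vp<%iv.next>   = VF * UF + vp<%iv>
//     EMIT vp<%part.next> = VF * Part + vp<%iv.next>
//     EMIT vp<%mask.next> = active lane mask vp<%part.next>
//     EMIT branch-on-active-lane-mask vp<%mask.next>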
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, bool IsVPlanNative) { + bool HasNUW, bool IsVPlanNative, + bool UseLaneMaskForLoopControlFlow) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); @@ -8689,10 +8699,32 @@ EB->setCondBit(nullptr); EB->appendRecipe(CanonicalIVIncrement); - auto *BranchOnCount = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchOnCount); + VPInstruction *BranchBack; + if (UseLaneMaskForLoopControlFlow) { + VPValue *TC = Plan.getOrCreateTripCount(); + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(TC, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + Plan.setActiveLaneMaskPhi(LaneMaskPhi); + + auto *CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {CanonicalIVIncrement}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + BranchBack = + new VPInstruction(VPInstruction::BranchOnActiveLaneMask, {ALM}, DL); + } else { + BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + } + EB->appendRecipe(BranchBack); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8791,9 +8823,13 @@ Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + bool HasTailFolding = CM.foldTailByMasking(); + bool UseLaneMaskForControlFlow = + HasTailFolding && (CM.TTI.emitGetActiveLaneMask() == + ActiveLaneMask::PredicateAndControlFlow); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), false); + !HasTailFolding, false, UseLaneMaskForControlFlow); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9131,8 +9167,11 @@ // 4. Remove exit block. delete Exit; + bool UseLaneMaskForControlFlow = + CM.foldTailByMasking() && + CM.TTI.emitGetActiveLaneMask() == ActiveLaneMask::PredicateAndControlFlow; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, true); + true, true, UseLaneMaskForControlFlow); return Plan; } Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -824,7 +824,12 @@ ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. 
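+  // For example, with an unroll factor of 4, parts 0..3 produce
+  // IV, IV + VF, IV + 2 * VF and IV + 3 * VF respectively, all computed
+  // from the same incoming operand when the recipe is executed.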
+ CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, + BranchOnActiveLaneMask, }; private: @@ -914,6 +919,7 @@ case Instruction::Fence: case Instruction::AtomicRMW: case VPInstruction::BranchOnCount: + case VPInstruction::BranchOnActiveLaneMask: return false; default: return true; @@ -935,7 +941,10 @@ case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: + case VPInstruction::BranchOnActiveLaneMask: return true; }; llvm_unreachable("switch should return"); @@ -1168,6 +1177,7 @@ /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1175,6 +1185,7 @@ } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1200,7 +1211,8 @@ /// Returns the incoming value from the loop backedge. VPValue *getBackedgeValue() { - return getOperand(1); + assert(getNumOperands() != 0 && "Invalid request for backedge value!"); + return getOperand(getNumOperands() - 1); } /// Returns the backedge value as a recipe. The backedge value is guaranteed @@ -1897,6 +1909,43 @@ } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + VPValue *TC; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *TC, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr), + DL(DL), TC(TC) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. + void execute(VPTransformState &State) override; + + VPValue *getTripCount() const { return TC; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2495,6 +2544,8 @@ /// the tail. It equals TripCount - 1. VPValue *BackedgeTakenCount = nullptr; + VPActiveLaneMaskPHIRecipe *LaneMask = nullptr; + /// Represents the vector trip count. 
VPValue VectorTripCount; @@ -2572,6 +2623,12 @@ return BackedgeTakenCount; } + void setActiveLaneMaskPhi(VPActiveLaneMaskPHIRecipe *Mask) { + LaneMask = Mask; + } + + VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi() const { return LaneMask; } + /// The vector trip count. VPValue &getVectorTripCount() { return VectorTripCount; } Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -794,17 +794,40 @@ State.set(this, Next, Part); break; } - case VPInstruction::BranchOnCount: { + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(Phi, Step, "index.part.next", IsNUW, false); + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCount: + case VPInstruction::BranchOnActiveLaneMask: { if (Part != 0) break; - // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - - // Now create the branch. + // First create the condition. auto *Plan = getParent()->getPlan(); VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + + Value *Cond; + if (getOpcode() == VPInstruction::BranchOnCount) { + Value *IV = State.get(getOperand(0), 0); + Value *TC = State.get(getOperand(1), 0); + Cond = Builder.CreateICmpEQ(IV, TC); + } else { + Value *LaneMask = State.get(getOperand(0), Part); + + // Exit the loop if there are no lanes active. Testing the first lane is + // sufficient for this. + Cond = Builder.CreateExtractElement(LaneMask, Builder.getInt32(0)); + Cond = Builder.CreateNot(Cond); + } + + // Now create the branch. VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); if (Header->empty()) { assert(EnableVPlanNativePath && @@ -872,9 +895,18 @@ case VPInstruction::CanonicalIVIncrementNUW: O << "VF * UF +(nuw) "; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; + case VPInstruction::BranchOnActiveLaneMask: + O << "branch-on-active-lane-mask "; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -1029,7 +1061,8 @@ // generated. bool SinglePartNeeded = isa(PhiR) || isa(PhiR) || - cast(PhiR)->isOrdered(); + (isa(PhiR) && + cast(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 
1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { @@ -1522,6 +1555,41 @@ (IsUniform || !VF.isScalable()); } +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Value *TC = State.get(getTripCount(), 0); + auto *Int1Ty = Type::getInt1Ty(State.Builder.getContext()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); + + IRBuilder<>::InsertPointGuard Guard(State.Builder); + State.Builder.SetInsertPoint(&(*VectorPH->getFirstInsertionPt())); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Instruction *StartMask; + + Value *StartIV = + createStepForVF(State.Builder, TC->getType(), State.VF, Part); + StartMask = State.Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredTy, TC->getType()}, {StartIV, TC}, + nullptr, "active.lane.mask"); + + PHINode *EntryPart = + PHINode::Create(StartMask->getType(), 2, "active.lane.mask", + &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = ACTIVE-LANE-MASK-PHI"; +} +#endif + void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.Instance && "cannot be used in per-lane"); const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); Index: llvm/lib/Transforms/Vectorize/VPlanValue.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanValue.h +++ llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ // Phi-like VPValues. Need to be kept together. VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -361,6 +362,7 @@ // Phi-like recipes. Need to be kept together. 
VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, Index: llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -183,9 +183,11 @@ } auto *LastInst = dyn_cast(std::prev(Exit->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || + (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnActiveLaneMask)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnActiveLaneMask VPInstruction\n"; return false; } Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll @@ -4,11 +4,12 @@ define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) { ; CHECK-LABEL: @invariant_store_red_exit_is_phi( ; CHECK: vector.body: +; CHECK: %[[ACTIVE_LANE_MASK:.*]] = phi [ %{{.*}}, %vector.ph ], [ %[[ACTIVE_LANE_MASK2:.*]], %vector.body ] ; CHECK: %[[VEC_PHI:.*]] = phi [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ] -; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n) ; CHECK: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32 ; CHECK-NEXT: %[[ADD:.*]] = add %[[VEC_PHI]], %[[LOAD]] ; CHECK-NEXT: %[[SELECT:.*]] = select %[[ACTIVE_LANE_MASK]], %[[ADD]], %[[VEC_PHI]] +; CHECK: %[[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %n) ; CHECK: middle.block: ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[SELECT]]) ; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4 Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -6,14 +6,17 @@ ; CHECK-LABEL: @trip7_i64( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 7) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} +; CHECK-NEXT: [[INDEX_NEXT_PART:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT_PART]], i64 7) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = extractelement 
[[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[COND:%.*]] = xor i1 [[FIRST_ACTIVE_LANE]], true ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body ; entry: @@ -40,14 +43,17 @@ ; CHECK-LABEL: @trip5_i8( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 {{%.*}}, i64 5) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8( {{%.*}}, * {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} +; CHECK-NEXT: [[INDEX_NEXT_PART:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT_PART]], i64 5) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[COND:%.*]] = xor i1 [[FIRST_ACTIVE_LANE]], true ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body ; entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -1,10 +1,30 @@ -; RUN: opt -S -loop-vectorize < %s | FileCheck %s +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize < %s 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=VPLANS ; These tests ensure that tail-folding is enabled when the predicate.enable ; loop attribute is set to true. 
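; The -debug-only=loop-vectorize RUN line additionally checks the initial
; VPlan: the vector loop should carry the mask in an ACTIVE-LANE-MASK-PHI and
; exit via branch-on-active-lane-mask rather than branch-on-count.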
target triple = "aarch64-unknown-linux-gnu" +; VPLANS-LABEL: Checking a loop in 'simple_memset' +; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS-NEXT: vector.ph: +; VPLANS-NEXT: Successor(s): vector loop +; VPLANS-EMPTY: +; VPLANS-NEXT: vector loop: { +; VPLANS-NEXT: vector.body: +; VPLANS-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; VPLANS-NEXT: EMIT vp<%3> = ACTIVE-LANE-MASK-PHI +; VPLANS-NEXT: vp<%4> = SCALAR-STEPS vp<%2>, ir<0>, ir<1> +; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%4> +; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<%3> +; VPLANS-NEXT: EMIT vp<%6> = VF * UF + vp<%2> +; VPLANS-NEXT: EMIT vp<%7> = VF * Part + vp<%6> +; VPLANS-NEXT: EMIT vp<%8> = active lane mask vp<%7> +; VPLANS-NEXT: EMIT branch-on-active-lane-mask vp<%8> +; VPLANS-NEXT: No successors +; VPLANS-NEXT: } define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 { ; CHECK-LABEL: @simple_memset( @@ -16,6 +36,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -28,18 +49,21 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; Index: 
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -6,14 +6,17 @@ ; CHECK-LABEL: @trip1024_i64( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 1024) +; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) ; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}} +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT]], i64 1024) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[COND:%.*]] = xor i1 [[FIRST_ACTIVE_LANE]], true ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body ; entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -14,72 +14,97 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16 -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 12 +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP13]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP14]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 
[[N_RND_UP]], [[TMP11]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i32 [[VAL]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector [[BROADCAST_SPLATINSERT13]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX1]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX1]], [[TMP23]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP19]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP24]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP32]] -; 
CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT6]], * [[TMP34]], i32 4, [[ACTIVE_LANE_MASK2]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT8]], * [[TMP38]], i32 4, [[ACTIVE_LANE_MASK3]]) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 12 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT10]], * [[TMP42]], i32 4, [[ACTIVE_LANE_MASK4]]) -; CHECK-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP44:%.*]] = mul i64 [[TMP43]], 16 -; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX1]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX1]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP36]], i32 4, [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = 
getelementptr i32, i32* [[TMP31]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT10]], * [[TMP40]], i32 4, [[ACTIVE_LANE_MASK4]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT12]], * [[TMP44]], i32 4, [[ACTIVE_LANE_MASK6]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 12 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT14]], * [[TMP48]], i32 4, [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX1]], [[TMP50]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT15]], 0 +; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT16:%.*]] = add i64 [[INDEX_NEXT15]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT17:%.*]] = add i64 [[INDEX_NEXT15]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT18:%.*]] = add i64 [[INDEX_NEXT15]], [[TMP56]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK19]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK20]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT16]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK21]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT17]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT18]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP57:%.*]] = extractelement [[ACTIVE_LANE_MASK19]], i32 0 +; CHECK-NEXT: [[TMP58:%.*]] = xor i1 [[TMP57]], true +; CHECK-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -108,102 +133,127 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16 -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; 
CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 12 +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP13]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP14]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP11]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement poison, i32 [[VAL]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector [[BROADCAST_SPLATINSERT12]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector [[BROADCAST_SPLATINSERT16]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX1]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX1]], [[TMP23]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP19]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP24]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, 
i32* [[COND_PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP34]], i32 4, [[ACTIVE_LANE_MASK2]], poison) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP38]], i32 4, [[ACTIVE_LANE_MASK3]], poison) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 12 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[TMP25]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP42]], i32 4, [[ACTIVE_LANE_MASK4]], poison) -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne [[WIDE_MASKED_LOAD6]], zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne [[WIDE_MASKED_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP51:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP43]], zeroinitializer -; CHECK-NEXT: [[TMP52:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP44]], zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP54:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP46]], zeroinitializer -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP56]], i32 4, [[TMP51]]) -; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP58:%.*]] = mul i32 [[TMP57]], 4 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT9]], * [[TMP60]], i32 4, [[TMP52]]) -; CHECK-NEXT: [[TMP61:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], 8 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to * -; 
CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT11]], * [[TMP64]], i32 4, [[TMP53]]) -; CHECK-NEXT: [[TMP65:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], 12 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[TMP47]], i32 [[TMP66]] -; CHECK-NEXT: [[TMP68:%.*]] = bitcast i32* [[TMP67]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT13]], * [[TMP68]], i32 4, [[TMP54]]) -; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX1]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP71]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX1]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP36]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP40]], i32 4, [[ACTIVE_LANE_MASK4]], poison) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP42]] +; 
CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP44]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 12 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP48]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-NEXT: [[TMP49:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = icmp ne [[WIDE_MASKED_LOAD9]], zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP57:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP49]], zeroinitializer +; CHECK-NEXT: [[TMP58:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP50]], zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP51]], zeroinitializer +; CHECK-NEXT: [[TMP60:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP52]], zeroinitializer +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP53]], i32 0 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP62]], i32 4, [[TMP57]]) +; CHECK-NEXT: [[TMP63:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP64:%.*]] = mul i32 [[TMP63]], 4 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[TMP65]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT13]], * [[TMP66]], i32 4, [[TMP58]]) +; CHECK-NEXT: [[TMP67:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], 8 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT15]], * [[TMP70]], i32 4, [[TMP59]]) +; CHECK-NEXT: [[TMP71:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], 12 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP53]], i32 [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT17]], * [[TMP74]], i32 4, [[TMP60]]) +; CHECK-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 16 +; CHECK-NEXT: [[INDEX_NEXT18]] = add i64 [[INDEX1]], [[TMP76]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT18]], 0 +; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT19:%.*]] = add i64 [[INDEX_NEXT18]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT20:%.*]] = add i64 [[INDEX_NEXT18]], [[TMP80]] +; CHECK-NEXT: 
[[TMP81:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP81]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT21:%.*]] = add i64 [[INDEX_NEXT18]], [[TMP82]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK23]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT19]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK24]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT20]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT21]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP83:%.*]] = extractelement [[ACTIVE_LANE_MASK22]], i32 0 +; CHECK-NEXT: [[TMP84:%.*]] = xor i1 [[TMP83]], true +; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -16,6 +16,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -28,18 +29,21 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement 
[[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -69,6 +73,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -79,22 +84,25 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -129,6 +137,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1>
@llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() @@ -149,25 +158,22 @@ ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP18]] -; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP20]], i64 [[TMP2]]) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP21]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP22]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP21]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = xor i1 [[TMP22]], true +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -199,6 +205,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label
[[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -209,22 +216,25 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK2]], undef) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -259,6 +269,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -270,20 
+281,23 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK2]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -318,6 +332,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -330,18 +345,18 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, 
[[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], undef) -; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP13]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] @@ -350,9 +365,12 @@ ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[PREDPHI]], * [[TMP20]], i32 4, [[TMP18]]) ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP22]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = xor i1 [[TMP23]], true +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -395,6 +413,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -408,18 +427,21 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, 
[[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK2]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; @@ -451,6 +473,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -461,25 +484,28 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]] ; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], 
i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], i32 4, [[ACTIVE_LANE_MASK2]]) ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK5]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK5]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -513,6 +539,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -523,22 +550,25 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK2]], [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: 
[[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -569,6 +599,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -579,21 +610,24 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK2]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK4]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; 
CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -623,6 +657,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() @@ -634,27 +669,30 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK1]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK1]], [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP14]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP17]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP18:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP17]], i32 4, [[TMP15]], poison) +; CHECK-NEXT: [[TMP18:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD2]], zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP18]]) ; CHECK-NEXT: [[TMP20]] = xor i32 [[TMP19]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK3]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK3]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = xor i1 [[TMP23]], true +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 
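A minimal sketch of the loop shape these check lines now expect, with %n, %vf and the value names standing in for each test's concrete trip count, per-iteration step (vscale * 4 here) and FileCheck captures:

  vector.ph:
    %mask.entry = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
    br label %vector.body

  vector.body:
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %mask = phi <vscale x 4 x i1> [ %mask.entry, %vector.ph ], [ %mask.next, %vector.body ]
    ; ... loads/stores/reductions predicated on %mask ...
    %index.next = add i64 %index, %vf
    %mask.next = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %n)
    %lane0 = extractelement <vscale x 4 x i1> %mask.next, i32 0
    %done = xor i1 %lane0, true
    br i1 %done, label %middle.block, label %vector.body

That is, the active lane mask is created in the preheader, carried by a loop phi, recomputed from the incremented index at the bottom of the loop, and lane 0 of the recomputed mask (rather than an icmp against the rounded-up trip count) decides whether to take the back-branch.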
; Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -9,10 +9,12 @@ ; we don't artificially create new predicated blocks for the load. define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 { ; CHECK-LABEL: @uniform_load( +; CHECK: vector.ph: +; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) ; CHECK: vector.body: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 -; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n) ; CHECK-NEXT: [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4 ; CHECK-NOT: load i32, i32* %src, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0 @@ -20,10 +22,13 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; CHECK-NEXT: [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec -; CHECK-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body +; CHECK-NEXT: [[IDX_NEXT_PART0:%.*]] = add i64 [[IDX_NEXT]], 0 +; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX_NEXT_PART0]], i64 %n) +; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], i32 0 +; CHECK-NEXT: [[FIRST_LANE_NOT_SET:%.*]] = xor i1 [[FIRST_LANE_SET]], true +; CHECK-NEXT: br i1 [[FIRST_LANE_NOT_SET]], label %middle.block, label %vector.body entry: br label %for.body @@ -48,16 +53,17 @@ define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 { ; CHECK-LABEL: @cond_uniform_load( ; CHECK: vector.ph: +; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) ; CHECK: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0 ; CHECK-NEXT: [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; CHECK: vector.body: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 -; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n) -; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison) +; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* 
{{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true> -; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer ; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef) entry: br label %for.body