diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -66,6 +66,7 @@ public: VPBuilder() = default; + VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); } /// Clear the insertion point: created instructions will not be inserted into /// a block. @@ -143,6 +144,13 @@ return createInstruction(Opcode, Operands, DL, Name); } + VPInstruction *createOverflowingOp(unsigned Opcode, + std::initializer_list Operands, + VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, + DebugLoc DL, const Twine &Name = "") { + return tryInsertInstruction( + new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + } VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8065,14 +8065,6 @@ return; } - // If we're using the active lane mask for control flow, then we get the - // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); - if (useActiveLaneMaskForControlFlow(TFStyle)) { - BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi(); - return; - } - // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first @@ -8086,14 +8078,8 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); VPValue *BlockMask = nullptr; - if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, - nullptr, "active.lane.mask"); - } else { - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); - } + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); BlockMaskCache[Header] = BlockMask; } @@ -8631,8 +8617,8 @@ // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - TailFoldingStyle Style) { +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, + DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); @@ -8644,93 +8630,19 @@ // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. - bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(VPInstruction::CanonicalIVIncrement, {CanonicalIVPHI}, {HasNUW, false}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - if (useActiveLaneMaskForControlFlow(Style)) { - // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *VecPreheader = - cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); - - // We can't use StartV directly in the ActiveLaneMask VPInstruction, since - // we have to take unrolling into account. Each part needs to start at - // Part * VF - auto *CanonicalIVIncrementParts = - new VPInstruction(VPInstruction::CanonicalIVIncrementForPart, {StartV}, - {HasNUW, false}, DL, "index.part.next"); - VecPreheader->appendRecipe(CanonicalIVIncrementParts); - - // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getTripCount(); - - VPValue *TripCount, *IncrementValue; - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // When avoiding a runtime check, the active.lane.mask inside the loop - // uses a modified trip count and the induction variable increment is - // done after the active.lane.mask intrinsic is called. - auto *TCMinusVF = - new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); - VecPreheader->appendRecipe(TCMinusVF); - IncrementValue = CanonicalIVPHI; - TripCount = TCMinusVF; - } else { - // When the loop is guarded by a runtime overflow check for the loop - // induction variable increment by VF, we can increment the value before - // the get.active.lane mask and use the unmodified tripcount. - EB->appendRecipe(CanonicalIVIncrement); - IncrementValue = CanonicalIVIncrement; - TripCount = TC; - } + EB->appendRecipe(CanonicalIVIncrement); - auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, - "active.lane.mask.entry"); - VecPreheader->appendRecipe(EntryALM); - - // Now create the ActiveLaneMaskPhi recipe in the main loop using the - // preheader ActiveLaneMask instruction. - auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); - Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); - - // Create the active lane mask for the next iteration of the loop. - CanonicalIVIncrementParts = - new VPInstruction(VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, {HasNUW, false}, DL); - EB->appendRecipe(CanonicalIVIncrementParts); - - auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TripCount}, DL, - "active.lane.mask.next"); - EB->appendRecipe(ALM); - LaneMaskPhi->addOperand(ALM); - - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // Do the increment of the canonical IV after the active.lane.mask, because - // that value is still based off %CanonicalIVPHI - EB->appendRecipe(CanonicalIVIncrement); - } - - // We have to invert the mask here because a true condition means jumping - // to the exit block. - auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); - EB->appendRecipe(NotMask); - - VPInstruction *BranchBack = - new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); - EB->appendRecipe(BranchBack); - } else { - EB->appendRecipe(CanonicalIVIncrement); - - // Add the BranchOnCount VPInstruction to the latch. - VPInstruction *BranchBack = new VPInstruction( - VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchBack); - } + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8817,8 +8729,11 @@ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DL, - CM.getTailFoldingStyle(IVUpdateMayOverflow)); + TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); + // When not folding the tail, we know that the induction increment will not + // overflow. + bool HasNUW = Style == TailFoldingStyle::None; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); // Proactively create header mask. Masks for other blocks are created on // demand. @@ -8987,6 +8902,15 @@ if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) return nullptr; + if (useActiveLaneMask(Style)) { + // TODO: Move checks to VPlanTransforms::addActiveLaneMask once + // TailFoldingStyle is visible there. + bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); + bool WithoutRuntimeCheck = + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, + WithoutRuntimeCheck); + } return Plan; } @@ -9021,8 +8945,11 @@ Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); Term->eraseFromParent(); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + // Tail folding is not supported for outer loops, so the induction increment + // is guaranteed to not wrap. + bool HasNUW = true; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, + DebugLoc()); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2682,10 +2682,6 @@ return cast(&*EntryVPBB->begin()); } - /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there - /// be only one at most. If there isn't one, then return nullptr. - VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); - void addLiveOut(PHINode *PN, VPValue *V); void removeLiveOut(PHINode *PN) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -718,15 +718,6 @@ return Plan; } -VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { - VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - if (isa(&R)) - return cast(&R); - } - return nullptr; -} - void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -67,6 +67,18 @@ /// regions until no improvements are remaining. static void createAndOptimizeReplicateRegions(VPlan &Plan); + /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an + /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p + /// UseActiveLaneMaskForControlFlow is true, introduce an + /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is + /// true, no minimum-iteration runtime check will be created (during skeleton + /// creation) and instead it is handled using active-lane-mask. \p + /// DataAndControlFlowWithoutRuntimeCheck implies \p + /// UseActiveLaneMaskForControlFlow. + static void addActiveLaneMask(VPlan &Plan, + bool UseActiveLaneMaskForControlFlow, + bool DataAndControlFlowWithoutRuntimeCheck); + private: /// Remove redundant VPBasicBlocks by merging them into their predecessor if /// the predecessor has a single successor. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -16,6 +16,7 @@ #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" @@ -855,3 +856,152 @@ removeRedundantExpandSCEVRecipes(Plan); mergeBlocksIntoPredecessors(Plan); } + +// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace +// the loop terminator with a branch-on-cond recipe with the negated +// active-lane-mask as operand. Note that this turns the loop into an +// uncountable one. Only the existing terminator is replaced, all other existing +// recipes/users remain unchanged, except for poison-generating flags being +// dropped from the canonical IV increment. Return the created +// VPActiveLaneMaskPHIRecipe. +// +// The function uses the following definitions: +// +// %TripCount = DataWithControlFlowWithoutRuntimeCheck ? +// calculate-trip-count-minus-VF (original TC) : original TC +// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? +// CanonicalIVPhi : CanonicalIVIncrement +// %StartV is the canonical induction start value. +// +// The function adds the following recipes: +// +// vector.ph: +// %TripCount = calculate-trip-count-minus-VF (original TC) +// [if DataWithControlFlowWithoutRuntimeCheck] +// %EntryInc = canonical-iv-increment-for-part %StartV +// %EntryALM = active-lane-mask %EntryInc, %TripCount +// +// vector.body: +// ... +// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] +// ... +// %InLoopInc = canonical-iv-increment-for-part %IncrementValue +// %ALM = active-lane-mask %InLoopInc, TripCount +// %Negated = Not %ALM +// branch-on-cond %Negated +// +static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( + VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); + auto *CanonicalIVPHI = Plan.getCanonicalIV(); + VPValue *StartV = CanonicalIVPHI->getStartValue(); + + auto *CanonicalIVIncrement = + cast(CanonicalIVPHI->getBackedgeValue()); + // TODO: Check if dropping the flags is needed if + // !DataAndControlFlowWithoutRuntimeCheck. + CanonicalIVIncrement->dropPoisonGeneratingFlags(); + DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *VecPreheader = cast(TopRegion->getSinglePredecessor()); + VPBuilder Builder(VecPreheader); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getTripCount(); + + VPValue *TripCount, *IncrementValue; + if (!DataAndControlFlowWithoutRuntimeCheck) { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. + IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } else { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + IncrementValue = CanonicalIVPHI; + TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, + {TC}, DL); + } + auto *EntryIncrement = Builder.createOverflowingOp( + VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL, + "index.part.next"); + + // Create the active lane mask instruction in the VPlan preheader. + auto *EntryALM = + Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, + DL, "active.lane.mask.entry"); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + LaneMaskPhi->insertAfter(CanonicalIVPHI); + + // Create the active lane mask for the next iteration of the loop before the + // original terminator. + VPRecipeBase *OriginalTerminator = EB->getTerminator(); + Builder.setInsertPoint(OriginalTerminator); + auto *InLoopIncrement = + Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, + {IncrementValue}, {false, false}, DL); + auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {InLoopIncrement, TripCount}, DL, + "active.lane.mask.next"); + LaneMaskPhi->addOperand(ALM); + + // Replace the original terminator with BranchOnCond. We have to invert the + // mask here because a true condition means jumping to the exit block. + auto *NotMask = Builder.createNot(ALM, DL); + Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); + OriginalTerminator->eraseFromParent(); + return LaneMaskPhi; +} + +void VPlanTransforms::addActiveLaneMask( + VPlan &Plan, bool UseActiveLaneMaskForControlFlow, + bool DataAndControlFlowWithoutRuntimeCheck) { + assert(!DataAndControlFlowWithoutRuntimeCheck || + UseActiveLaneMaskForControlFlow && + "DataAndControlFlowWithoutRuntimeCheck implies " + "UseActiveLaneMaskForControlFlow"); + + auto FoundWidenCanonicalIVUser = + find_if(Plan.getCanonicalIV()->users(), + [](VPUser *U) { return isa(U); }); + assert(FoundWidenCanonicalIVUser && + "Must have widened canonical IV when tail folding!"); + auto *WideCanonicalIV = + cast(*FoundWidenCanonicalIVUser); + VPRecipeBase *LaneMask; + if (UseActiveLaneMaskForControlFlow) { + LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( + Plan, DataAndControlFlowWithoutRuntimeCheck); + } else { + LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount()}, + nullptr, "active.lane.mask"); + LaneMask->insertAfter(WideCanonicalIV); + } + + // Walk users of WideCanonicalIV and replace all compares of the form + // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an + // active-lane-mask. + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + for (VPUser *U : SmallVector(WideCanonicalIV->users())) { + auto *CompareToReplace = dyn_cast(U); + if (!CompareToReplace || + CompareToReplace->getOpcode() != Instruction::ICmp || + CompareToReplace->getPredicate() != CmpInst::ICMP_ULE || + CompareToReplace->getOperand(1) != BTC) + continue; + + assert(CompareToReplace->getOperand(0) == WideCanonicalIV && + "WidenCanonicalIV must be the first operand of the compare"); + CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue()); + CompareToReplace->eraseFromParent(); + } +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -153,10 +153,10 @@ ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -397,34 +397,34 @@ ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]] +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) @@ -480,21 +480,21 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) ; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = add i64 [[INDEX]], [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = add i64 [[INDEX]], [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = add i64 [[INDEX]], [[TMP76]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP71]], i64 [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP74]], i64 [[TMP25]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP77]], i64 [[TMP30]]) -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = mul i64 [[TMP78]], 32 -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP79]] +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP70]] +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = add i64 [[INDEX]], [[TMP72]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]] +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP73]], i64 [[TMP14]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP24]]) ; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -755,10 +755,10 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP18]]) ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -1012,10 +1012,10 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]]) ; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]]) ; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -1258,10 +1258,10 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or [[TMP15]], [[TMP18]] ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1709,34 +1709,34 @@ ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]] +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) @@ -1814,21 +1814,21 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) ; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = mul i64 [[TMP90]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = add i64 [[INDEX]], [[TMP91]] -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = mul i64 [[TMP93]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = add i64 [[INDEX]], [[TMP94]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP92]], i64 [[TMP25]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP95]], i64 [[TMP30]]) -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = mul i64 [[TMP96]], 32 -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP97]] +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] +; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] +; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]]) ; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -2123,34 +2123,34 @@ ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]] +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) @@ -2228,21 +2228,21 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) ; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = mul i64 [[TMP90]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = add i64 [[INDEX]], [[TMP91]] -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = mul i64 [[TMP93]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = add i64 [[INDEX]], [[TMP94]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP92]], i64 [[TMP25]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP95]], i64 [[TMP30]]) -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = mul i64 [[TMP96]], 32 -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP97]] +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] +; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] +; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]]) ; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -146,10 +146,10 @@ ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] @@ -312,10 +312,10 @@ ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP13]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] @@ -487,10 +487,10 @@ ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -17,8 +17,8 @@ ; VPLANS-NEXT: No successors ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: -; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> ; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? TC - VF : 0 vp<[[TC]]> +; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> ; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> ; VPLANS-NEXT: Successor(s): vector loop ; VPLANS-EMPTY: @@ -29,9 +29,9 @@ ; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1> ; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]> ; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<[[LANEMASK_PHI]]> +; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = VF * UF + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]> -; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = VF * UF + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[NOT:%[0-9]+]]> = not vp<[[LANEMASK_LOOP]]> ; VPLANS-NEXT: EMIT branch-on-cond vp<[[NOT]]> ; VPLANS-NEXT: No successors @@ -67,10 +67,10 @@ ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -89,10 +89,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -37,10 +37,10 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP14]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], [[VEC_PHI]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -95,10 +95,10 @@ ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 ; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -169,10 +169,10 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -225,10 +225,10 @@ ; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -306,10 +306,10 @@ ; CHECK-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP20]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -376,10 +376,10 @@ ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP17]]) ; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]] -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 ; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -19,34 +19,34 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 -; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 -; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) @@ -55,11 +55,11 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 ; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 @@ -95,25 +95,25 @@ ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) ; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 4 -; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[INDEX6]], [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 8 -; CHECK-NEXT: [[TMP66:%.*]] = add i64 [[INDEX6]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 12 -; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[INDEX6]], [[TMP68]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP63]], i64 [[TMP20]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP66]], i64 [[TMP25]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP69]], i64 [[TMP30]]) -; CHECK-NEXT: [[TMP70:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP70]], 16 -; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP71]] +; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 +; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4 +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 +; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]] +; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 12 +; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP14]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP19]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP24]]) ; CHECK-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0 ; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -161,34 +161,34 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 -; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 -; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) @@ -197,11 +197,11 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 ; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 @@ -263,25 +263,25 @@ ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, [[TMP72]]) ; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 4 -; CHECK-NEXT: [[TMP85:%.*]] = add i64 [[INDEX6]], [[TMP84]] -; CHECK-NEXT: [[TMP86:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP87:%.*]] = mul i64 [[TMP86]], 8 -; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX6]], [[TMP87]] -; CHECK-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 12 -; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[INDEX6]], [[TMP90]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP85]], i64 [[TMP20]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP88]], i64 [[TMP25]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP91]], i64 [[TMP30]]) -; CHECK-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-NEXT: [[INDEX_NEXT16]] = add i64 [[INDEX6]], [[TMP93]] +; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 +; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]] +; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 4 +; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]] +; CHECK-NEXT: [[TMP88:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP88]], 8 +; CHECK-NEXT: [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]] +; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 12 +; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP14]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP19]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP24]]) ; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0 ; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -34,10 +34,10 @@ ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -95,8 +95,8 @@ ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -162,10 +162,10 @@ ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -246,10 +246,10 @@ ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) ; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 @@ -321,10 +321,10 @@ ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -399,10 +399,10 @@ ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -483,10 +483,10 @@ ; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP19]], i32 4, [[TMP18]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -574,10 +574,10 @@ ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -648,10 +648,10 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP15:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP15]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -729,10 +729,10 @@ ; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP16:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP15]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP16]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP18]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -25,8 +25,8 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr %dst, i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]]) ; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4 +; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]]) ; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], ; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0 ; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -233,10 +233,10 @@ ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] +; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]