Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -531,7 +531,7 @@
 
   /// Query the target whether lowering of the llvm.get.active.lane.mask
   /// intrinsic is supported.
-  bool emitGetActiveLaneMask() const;
+  bool emitGetActiveLaneMask(bool &UseForControlFlow) const;
 
   // Parameters that control the loop peeling transformation
   struct PeelingPreferences {
@@ -1527,7 +1527,7 @@
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
-  virtual bool emitGetActiveLaneMask() = 0;
+  virtual bool emitGetActiveLaneMask(bool &UseForControlFlow) = 0;
   virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                        IntrinsicInst &II) = 0;
   virtual Optional<Value *>
@@ -1899,8 +1899,8 @@
                                    const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
-  bool emitGetActiveLaneMask() override {
-    return Impl.emitGetActiveLaneMask();
+  bool emitGetActiveLaneMask(bool &UseForControlFlow) override {
+    return Impl.emitGetActiveLaneMask(UseForControlFlow);
   }
   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                IntrinsicInst &II) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -167,9 +167,7 @@
     return false;
   }
 
-  bool emitGetActiveLaneMask() const {
-    return false;
-  }
+  bool emitGetActiveLaneMask(bool &UseForControlFlow) const { return false; }
 
   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                IntrinsicInst &II) const {
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -584,8 +584,8 @@
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
 
-  bool emitGetActiveLaneMask() {
-    return BaseT::emitGetActiveLaneMask();
+  bool emitGetActiveLaneMask(bool &UseForControlFlow) {
+    return BaseT::emitGetActiveLaneMask(UseForControlFlow);
   }
 
   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -293,8 +293,8 @@
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
 
-bool TargetTransformInfo::emitGetActiveLaneMask() const {
-  return TTIImpl->emitGetActiveLaneMask();
+bool TargetTransformInfo::emitGetActiveLaneMask(bool &UseForControlFlow) const {
+  return TTIImpl->emitGetActiveLaneMask(UseForControlFlow);
 }
 
 Optional<Instruction *>
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -330,8 +330,12 @@
     return 2;
   }
 
-  bool emitGetActiveLaneMask() const {
-    return ST->hasSVE();
+  bool emitGetActiveLaneMask(bool &UseForControlFlow) const {
+    if (ST->hasSVE()) {
+      UseForControlFlow = true;
+      return true;
+    }
+    return false;
   }
 
   bool supportsScalableVectors() const { return ST->hasSVE(); }
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -298,7 +298,7 @@
                              TTI::UnrollingPreferences &UP,
                              OptimizationRemarkEmitter *ORE);
 
-  bool emitGetActiveLaneMask() const;
+  bool emitGetActiveLaneMask(bool &UseForControlFlow) const;
 
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              TTI::PeelingPreferences &PP);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2204,7 +2204,7 @@
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
 
-bool ARMTTIImpl::emitGetActiveLaneMask() const {
+bool ARMTTIImpl::emitGetActiveLaneMask(bool &UseForControlFlow) const {
   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
     return false;
 
@@ -2212,6 +2212,7 @@
   // It is used in the MVETailPredication pass, which requires the number of
   // elements processed by this vector loop to setup the tail-predicated
   // loop.
+  UseForControlFlow = false;
   return true;
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1105,7 +1105,8 @@
       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
-         isa<VPCanonicalIVPHIRecipe>(CurRec))
+         isa<VPCanonicalIVPHIRecipe>(CurRec) ||
+         isa<VPActiveLaneMaskPHIRecipe>(CurRec))
         continue;
 
       // This recipe contributes to the address computation of a widen
@@ -8109,6 +8110,13 @@
   // constructing the desired canonical IV in the header block as its first
   // non-phi instructions.
   assert(CM.foldTailByMasking() && "must fold the tail");
+
+  bool UseLaneMaskForControlFlow;
+  bool EmitGetActiveLaneMask =
+      CM.TTI.emitGetActiveLaneMask(UseLaneMaskForControlFlow);
+  if (EmitGetActiveLaneMask && UseLaneMaskForControlFlow)
+    return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
+
   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
@@ -8117,7 +8125,7 @@
   VPBuilder::InsertPointGuard Guard(Builder);
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
 
-  if (CM.TTI.emitGetActiveLaneMask()) {
+  if (EmitGetActiveLaneMask) {
     VPValue *TC = Plan->getOrCreateTripCount();
     BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
   } else {
@@ -8682,7 +8690,8 @@
 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
 // BranchOnCount VPInstruction to the latch.
 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
-                                  bool HasNUW, bool IsVPlanNative) {
+                                  bool HasNUW, bool IsVPlanNative,
+                                  bool UseLaneMaskForControlFlow) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddVPValue(StartIdx);
 
@@ -8702,10 +8711,32 @@
   EB->setCondBit(nullptr);
   EB->appendRecipe(CanonicalIVIncrement);
 
-  auto *BranchOnCount =
-      new VPInstruction(VPInstruction::BranchOnCount,
-                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
-  EB->appendRecipe(BranchOnCount);
+  VPInstruction *BranchBack;
+  if (UseLaneMaskForControlFlow) {
+    VPValue *TC = Plan.getOrCreateTripCount();
+    auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(TC, DebugLoc());
+    Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
+    Plan.setActiveLaneMaskPhi(LaneMaskPhi);
+
+    auto *CanonicalIVIncrementParts =
+        new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementPartsNUW
                                  : VPInstruction::CanonicalIVIncrementParts,
+                          {CanonicalIVIncrement}, DL);
+    EB->appendRecipe(CanonicalIVIncrementParts);
+
+    auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                  {CanonicalIVIncrementParts, TC}, DL);
+    EB->appendRecipe(ALM);
+    LaneMaskPhi->addOperand(ALM);
+
+    BranchBack =
+        new VPInstruction(VPInstruction::BranchOnActiveLaneMask, {ALM}, DL);
+  } else {
+    BranchBack = new VPInstruction(
+        VPInstruction::BranchOnCount,
+        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+  }
+  EB->appendRecipe(BranchBack);
 }
 
 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
@@ -8784,9 +8815,13 @@
 
   Instruction *DLInst =
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+  bool HasTailFolding = CM.foldTailByMasking();
+  bool UseLaneMaskForControlFlow = false;
+  if (HasTailFolding)
+    CM.TTI.emitGetActiveLaneMask(UseLaneMaskForControlFlow);
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
-                        !CM.foldTailByMasking(), false);
+                        !HasTailFolding, false, UseLaneMaskForControlFlow);
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -9121,8 +9156,11 @@
   // 4. Remove exit block.
   delete Exit;
 
+  bool UseLaneMaskForControlFlow = false;
+  if (CM.foldTailByMasking())
+    CM.TTI.emitGetActiveLaneMask(UseLaneMaskForControlFlow);
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
-                        true, true);
+                        true, true, UseLaneMaskForControlFlow);
 
   return Plan;
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -813,7 +813,10 @@
     ActiveLaneMask,
     CanonicalIVIncrement,
     CanonicalIVIncrementNUW,
+    CanonicalIVIncrementParts,
+    CanonicalIVIncrementPartsNUW,
     BranchOnCount,
+    BranchOnActiveLaneMask,
   };
 
 private:
@@ -903,6 +906,7 @@
     case Instruction::Fence:
     case Instruction::AtomicRMW:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::BranchOnActiveLaneMask:
       return false;
     default:
       return true;
@@ -924,7 +928,10 @@
     case VPInstruction::ActiveLaneMask:
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementNUW:
+    case VPInstruction::CanonicalIVIncrementParts:
+    case VPInstruction::CanonicalIVIncrementPartsNUW:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::BranchOnActiveLaneMask:
       return true;
     };
     llvm_unreachable("switch should return");
@@ -1157,6 +1164,7 @@
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *B) {
     return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC ||
+           B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC ||
           B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC ||
           B->getVPDefID() == VPRecipeBase::VPReductionPHISC ||
           B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC ||
   }
   static inline bool classof(const VPValue *V) {
     return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC ||
+           V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC ||
           V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC ||
           V->getVPValueID() == VPValue::VPVReductionPHISC ||
           V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC ||
@@ -1189,7 +1198,8 @@
   /// Returns the incoming value from the loop backedge.
   VPValue *getBackedgeValue() {
-    return getOperand(1);
+    assert(getNumOperands() != 0 && "Invalid request for backedge value!");
+    return getOperand(getNumOperands() - 1);
   }
 
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
@@ -1881,6 +1891,41 @@
   }
 };
 
+class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
+  DebugLoc DL;
+  VPValue *TC;
+
+public:
+  VPActiveLaneMaskPHIRecipe(VPValue *TC, DebugLoc DL)
+      : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC,
+                          VPActiveLaneMaskPHISC, nullptr),
+        DL(DL), TC(TC) {}
+
+  ~VPActiveLaneMaskPHIRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPDef *D) {
+    return D->getVPDefID() == VPActiveLaneMaskPHISC;
+  }
+  static inline bool classof(const VPHeaderPHIRecipe *D) {
+    return D->getVPDefID() == VPActiveLaneMaskPHISC;
+  }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC;
+  }
+
+  /// Generate the active lane mask phi of the vector loop.
+  void execute(VPTransformState &State) override;
+
+  VPValue *getTripCount() const { return TC; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A Recipe for widening the canonical induction variable of the vector loop.
 class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
 public:
@@ -2479,6 +2524,8 @@
   /// the tail. It equals TripCount - 1.
   VPValue *BackedgeTakenCount = nullptr;
 
+  VPActiveLaneMaskPHIRecipe *LaneMask = nullptr;
+
   /// Represents the vector trip count.
   VPValue VectorTripCount;
 
@@ -2551,6 +2598,12 @@
     return BackedgeTakenCount;
   }
 
+  void setActiveLaneMaskPhi(VPActiveLaneMaskPHIRecipe *Mask) {
+    LaneMask = Mask;
+  }
+
+  VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi() const { return LaneMask; }
+
   /// The vector trip count.
   VPValue &getVectorTripCount() { return VectorTripCount; }
 
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -785,17 +785,40 @@
     State.set(this, Next, Part);
     break;
   }
-  case VPInstruction::BranchOnCount: {
+  case VPInstruction::CanonicalIVIncrementParts:
+  case VPInstruction::CanonicalIVIncrementPartsNUW: {
+    bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementPartsNUW;
+    auto *Phi = State.get(getOperand(0), 0);
+    // The loop step is equal to the vectorization factor (num of SIMD
+    // elements) times the unroll factor (num of SIMD instructions).
+    Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, Part);
+    Value *Next = Builder.CreateAdd(Phi, Step, "index.part.next", IsNUW, false);
+    State.set(this, Next, Part);
+    break;
+  }
+  case VPInstruction::BranchOnCount:
+  case VPInstruction::BranchOnActiveLaneMask: {
     if (Part != 0)
       break;
-    // First create the compare.
-    Value *IV = State.get(getOperand(0), Part);
-    Value *TC = State.get(getOperand(1), Part);
-    Value *Cond = Builder.CreateICmpEQ(IV, TC);
-
-    // Now create the branch.
+    // First create the condition.
     auto *Plan = getParent()->getPlan();
     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+
+    Value *Cond;
+    if (getOpcode() == VPInstruction::BranchOnCount) {
+      Value *IV = State.get(getOperand(0), 0);
+      Value *TC = State.get(getOperand(1), 0);
+      Cond = Builder.CreateICmpEQ(IV, TC);
+    } else {
+      Value *IV = State.get(getOperand(0), Part);
+
+      // Exit the loop if there are no lanes active. Testing the first lane is
+      // sufficient for this.
+      Cond = Builder.CreateExtractElement(IV, Builder.getInt32(0));
+      Cond = Builder.CreateNot(Cond);
+    }
+
+    // Now create the branch.
     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
     if (Header->empty()) {
       assert(EnableVPlanNativePath &&
@@ -863,9 +886,18 @@
   case VPInstruction::CanonicalIVIncrementNUW:
     O << "VF * UF +(nuw) ";
     break;
+  case VPInstruction::CanonicalIVIncrementParts:
+    O << "VF * Part + ";
+    break;
+  case VPInstruction::CanonicalIVIncrementPartsNUW:
+    O << "VF * Part +(nuw) ";
+    break;
   case VPInstruction::BranchOnCount:
     O << "branch-on-count ";
     break;
+  case VPInstruction::BranchOnActiveLaneMask:
+    O << "branch-on-active-lane-mask ";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -1022,7 +1054,8 @@
     // generated.
     bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
                             isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
-                            cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
+                            (isa<VPReductionPHIRecipe>(PhiR) &&
+                             cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
     unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
 
     for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
@@ -1483,6 +1516,42 @@
 }
 #endif
 
+void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
+  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+  Value *TC = State.get(getTripCount(), 0);
+  auto *Int1Ty = Type::getInt1Ty(State.Builder.getContext());
+  auto *PredTy = VectorType::get(Int1Ty, State.VF);
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    Instruction *StartMask;
+    {
+      IRBuilder<>::InsertPointGuard Guard(State.Builder);
+      State.Builder.SetInsertPoint(&(*VectorPH->getFirstInsertionPt()));
+
+      Value *StartIV =
+          createStepForVF(State.Builder, TC->getType(), State.VF, Part);
+      StartMask = State.Builder.CreateIntrinsic(
+          Intrinsic::get_active_lane_mask, {PredTy, TC->getType()},
+          {StartIV, TC}, nullptr, "active.lane.mask");
+    }
+
+    PHINode *EntryPart =
+        PHINode::Create(StartMask->getType(), 2, "active.lane.mask",
+                        &*State.CFG.PrevBB->getFirstInsertionPt());
+    EntryPart->addIncoming(StartMask, VectorPH);
+    EntryPart->setDebugLoc(DL);
+    State.set(this, EntryPart, Part);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                                      VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = ACTIVE-LANE-MASK-PHI";
+}
+#endif
+
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "cannot be used in per-lane");
   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
Index: llvm/lib/Transforms/Vectorize/VPlanValue.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -103,6 +103,7 @@
   // Phi-like VPValues. Need to be kept together.
   VPVBlendSC,
   VPVCanonicalIVPHISC,
+  VPVActiveLaneMaskPHISC,
   VPVFirstOrderRecurrencePHISC,
   VPVWidenPHISC,
   VPVWidenIntOrFpInductionSC,
@@ -344,6 +345,7 @@
   // Phi-like recipes. Need to be kept together.
   VPBlendSC,
   VPCanonicalIVPHISC,
+  VPActiveLaneMaskPHISC,
   VPFirstOrderRecurrencePHISC,
   VPWidenPHISC,
   VPWidenIntOrFpInductionSC,
Index: llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -183,9 +183,11 @@
   }
 
   auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end()));
-  if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) {
-    errs() << "VPlan vector loop exit must end with BranchOnCount "
-              "VPInstruction\n";
+  if (!LastInst ||
+      (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
+       LastInst->getOpcode() != VPInstruction::BranchOnActiveLaneMask)) {
+    errs() << "VPlan vector loop exit must end with BranchOnCount or "
+              "BranchOnActiveLaneMask VPInstruction\n";
     return false;
   }
 
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -1,17 +1,38 @@
-; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=VPLANS
 
 ; These tests ensure that tail-folding is enabled when the predicate.enable
 ; loop attribute is set to true.
 
 target triple = "aarch64-unknown-linux-gnu"
 
+; VPLANS-LABEL: Checking a loop in 'simple_memset'
+; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; VPLANS-NEXT: vector.ph:
+; VPLANS-NEXT: Successor(s): vector loop
+; VPLANS-EMPTY:
+; VPLANS-NEXT: vector loop: {
+; VPLANS-NEXT: vector.body:
+; VPLANS-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
+; VPLANS-NEXT: EMIT vp<%3> = ACTIVE-LANE-MASK-PHI
+; VPLANS-NEXT: vp<%4> = SCALAR-STEPS vp<%2>, ir<0>, ir<1>
+; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%4>
+; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<%3>
+; VPLANS-NEXT: EMIT vp<%7> = VF * UF + vp<%2>
+; VPLANS-NEXT: EMIT vp<%8> = VF * Part + vp<%7>
+; VPLANS-NEXT: EMIT vp<%9> = active lane mask vp<%8>
+; VPLANS-NEXT: EMIT branch-on-active-lane-mask vp<%9>
+; VPLANS-NEXT: No successors
+; VPLANS-NEXT: }
 
 define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -20,24 +41,27 @@
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] =
shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT6]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP8]], i32 4, [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP10]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph +; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; entry: br label %while.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -11,71 +11,96 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 12 +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP10]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i32 [[VAL]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector [[BROADCAST_SPLATINSERT13]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 12 -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[TMP19]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP15]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP20]], i64 [[UMAX]]) -; 
CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP26]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP21]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT6]], * [[TMP30]], i32 4, [[ACTIVE_LANE_MASK2]]) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 8 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP21]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT8]], * [[TMP34]], i32 4, [[ACTIVE_LANE_MASK3]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 12 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP21]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT10]], * [[TMP38]], i32 4, [[ACTIVE_LANE_MASK4]]) -; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 16 -; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX1]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX1]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 12 +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 1 +; CHECK-NEXT: 
[[TMP26:%.*]] = add i64 [[INDEX1]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP27]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP32]], i32 4, [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[TMP33]], 4 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP27]], i32 [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT10]], * [[TMP36]], i32 4, [[ACTIVE_LANE_MASK4]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 8 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP27]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT12]], * [[TMP40]], i32 4, [[ACTIVE_LANE_MASK6]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 12 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP27]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT14]], * [[TMP44]], i32 4, [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 16 +; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX1]], [[TMP46]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT15]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT16:%.*]] = add i64 [[INDEX_NEXT15]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT17:%.*]] = add i64 [[INDEX_NEXT15]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT18:%.*]] = add i64 [[INDEX_NEXT15]], [[TMP52]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK19]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK20]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT16]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK21]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT17]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT18]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP53:%.*]] = extractelement [[ACTIVE_LANE_MASK19]], i32 0 +; CHECK-NEXT: [[TMP54:%.*]] = xor i1 [[TMP53]], true +; CHECK-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; @@ -101,101 +126,126 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: 
[[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 12 +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP10]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i32 [[VAL]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement poison, i32 [[VAL]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector [[BROADCAST_SPLATINSERT12]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement poison, i32 [[VAL]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector [[BROADCAST_SPLATINSERT16]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 
[[TMP16]], 12 -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[TMP19]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP15]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP20]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP26]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP21]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP30]], i32 4, [[ACTIVE_LANE_MASK2]], poison) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 8 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP21]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP34]], i32 4, [[ACTIVE_LANE_MASK3]], poison) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 12 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[TMP21]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP38]], i32 4, [[ACTIVE_LANE_MASK4]], poison) -; CHECK-NEXT: [[TMP39:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP40:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; CHECK-NEXT: [[TMP41:%.*]] = icmp ne [[WIDE_MASKED_LOAD6]], zeroinitializer -; CHECK-NEXT: [[TMP42:%.*]] = icmp ne [[WIDE_MASKED_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP47:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP39]], zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP49:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, i32* [[TMP43]], i32 0 -; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to * -; CHECK-NEXT: call void 
@llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP52]], i32 4, [[TMP47]]) -; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], 4 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP43]], i32 [[TMP54]] -; CHECK-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT9]], * [[TMP56]], i32 4, [[TMP48]]) -; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP58:%.*]] = mul i32 [[TMP57]], 8 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP43]], i32 [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT11]], * [[TMP60]], i32 4, [[TMP49]]) -; CHECK-NEXT: [[TMP61:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], 12 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i32, i32* [[TMP43]], i32 [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT13]], * [[TMP64]], i32 4, [[TMP50]]) -; CHECK-NEXT: [[TMP65:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP65]], 16 -; CHECK-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX1]], [[TMP66]] -; CHECK-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP67]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX1]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 12 +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[INDEX1]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP27]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to * +; 
CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP32]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[TMP33]], 4 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP27]], i32 [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP36]], i32 4, [[ACTIVE_LANE_MASK4]], poison) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 8 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP27]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP40]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 12 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP27]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP44]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-NEXT: [[TMP45:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne [[WIDE_MASKED_LOAD9]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP26]] +; CHECK-NEXT: [[TMP53:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP54:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP46]], zeroinitializer +; CHECK-NEXT: [[TMP55:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP47]], zeroinitializer +; CHECK-NEXT: [[TMP56:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP48]], zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, i32* [[TMP49]], i32 0 +; CHECK-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP58]], i32 4, [[TMP53]]) +; CHECK-NEXT: [[TMP59:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP60:%.*]] = mul i32 [[TMP59]], 4 +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP49]], i32 [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT13]], * [[TMP62]], i32 4, [[TMP54]]) +; CHECK-NEXT: [[TMP63:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP64:%.*]] = mul i32 [[TMP63]], 8 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[TMP49]], i32 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[TMP65]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT15]], * [[TMP66]], i32 4, [[TMP55]]) +; CHECK-NEXT: [[TMP67:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], 12 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP49]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to * +; CHECK-NEXT: call void 
@llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT17]], * [[TMP70]], i32 4, [[TMP56]]) +; CHECK-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 16 +; CHECK-NEXT: [[INDEX_NEXT18]] = add i64 [[INDEX1]], [[TMP72]] +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT18]], 0 +; CHECK-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT19:%.*]] = add i64 [[INDEX_NEXT18]], [[TMP74]] +; CHECK-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT20:%.*]] = add i64 [[INDEX_NEXT18]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT21:%.*]] = add i64 [[INDEX_NEXT18]], [[TMP78]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK23]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT19]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK24]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT20]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT21]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP79:%.*]] = extractelement [[ACTIVE_LANE_MASK22]], i32 0 +; CHECK-NEXT: [[TMP80:%.*]] = xor i1 [[TMP79]], true +; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -10,8 +10,9 @@ ; CHECK-LABEL: @simple_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -20,24 +21,27 @@ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: br label %vector.body +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], 
i64 [[UMAX]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP10]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -59,8 +63,9 @@
 ; CHECK-LABEL: @simple_memcpy(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -69,26 +74,29 @@
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP13]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = xor i1 [[TMP14]], true
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -115,8 +123,9 @@
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
@@ -135,29 +144,26 @@
 ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 4, [[TMP12]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IV]], i32 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP16]], i64 [[TMP2]])
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -181,8 +187,9 @@
 ; CHECK-LABEL: @simple_gather_scatter(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -193,24 +200,27 @@
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP12]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -237,35 +247,39 @@
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N:%.*]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = xor i1 [[TMP12]], true
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ;
 entry:
@@ -292,45 +306,49 @@
 define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N:%.*]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT6]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP18]])
-; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]])
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP18]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ;
 entry:
@@ -365,35 +383,39 @@
 define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_store(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N:%.*]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT4]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ;
 entry:
@@ -417,8 +439,9 @@
 ; CHECK-LABEL: @simple_fdiv(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -427,29 +450,32 @@
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP16:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD5]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP16]], <vscale x 4 x float>* [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label %middle.block, label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP12:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP12]], <vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX1]], [[TMP15]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT4]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK5]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK5]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -475,8 +501,9 @@
 ; CHECK-LABEL: @add_reduction_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -485,26 +512,29 @@
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT: [[TMP14]] = add i32 [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
+; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP13]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = xor i1 [[TMP14]], true
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -527,8 +557,9 @@
 ; CHECK-LABEL: @add_reduction_f32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -537,25 +568,28 @@
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body, !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP10]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP12]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT3]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
 br label %while.body
@@ -577,42 +611,47 @@
 define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
 ; CHECK-LABEL: @cond_xor_reduction(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N:%.*]])
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %vector.body
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, %vector.ph ], [ [[TMP16:%.*]], %vector.body ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP14]])
 ; CHECK-NEXT: [[TMP16]] = xor i32 [[TMP15]], [[VEC_PHI]]
 ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK3]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK3]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label %scalar.ph
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+;
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -9,10 +9,12 @@
 ; we don't artificially create new predicated blocks for the load.
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
+; CHECK: vector.ph:
+; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n)
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ]
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
-; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
 ; CHECK-NEXT: [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4
 ; CHECK-NOT: load i32, i32* %src, align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0
@@ -20,10 +22,13 @@
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
 ; CHECK-NEXT: [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec
-; CHECK-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body
+; CHECK-NEXT: [[IDX_NEXT_PART0:%.*]] = add i64 [[IDX_NEXT]], 0
+; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX_NEXT_PART0]], i64 %n)
+; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: [[FIRST_LANE_NOT_SET:%.*]] = xor i1 [[FIRST_LANE_SET]], true
+; CHECK-NEXT: br i1 [[FIRST_LANE_NOT_SET]], label %middle.block, label %vector.body
 entry:
 br label %for.body
@@ -48,16 +53,17 @@
 define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK: vector.ph:
+; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n)
 ; CHECK: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0
 ; CHECK-NEXT: [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ]
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
-; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
-; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison)
+; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer
 ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
 ; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef)
 entry:
 br label %for.body