Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -31,15 +31,14 @@
 /// blocks of instructions operating on different vector types.
 ///
 /// This pass:
-/// 1) Pattern matches the scalar iteration count produced by the vectoriser.
-///    The scalar loop iteration count represents the number of elements to be
-///    processed.
-///    TODO: this could be emitted using an intrinsic, similar to the hardware
-///    loop intrinsics, so that we don't need to pattern match this here.
-/// 2) Inserts the VCTP intrinsic to represent the effect of
-///    tail predication. This will be picked up by the ARM Low-overhead loop
-///    pass, which performs the final transformation to a DLSTP or WLSTP
-///    tail-predicated loop.
+/// 1) Checks if the predicates of the masked load/store instructions are
+///    generated by the intrinsic @llvm.get.active.lane.mask(). This intrinsic
+///    consumes the Backedge Taken Count (BTC) of the scalar loop as its second
+///    argument, which we extract to set up the number of elements processed by the loop.
+/// 2) The intrinsic @llvm.get.active.lane.mask() is then replaced by the MVE target
+///    specific VCTP intrinsic to represent the effect of tail predication.
+///    This will be picked up by the ARM Low-overhead loop pass, which performs
+///    the final transformation to a DLSTP or WLSTP tail-predicated loop.
 
 #include "ARM.h"
 #include "ARMSubtarget.h"
@@ -70,28 +69,6 @@
     cl::desc("Disable MVE Tail Predication"));
 
 namespace {
 
-// Bookkeeping for pattern matching the loop trip count and the number of
-// elements processed by the loop.
-struct TripCountPattern {
-  // An icmp instruction that calculates a predicate of active/inactive lanes
-  // used by the masked loads/stores.
-  Instruction *Predicate = nullptr;
-
-  // The add instruction that increments the IV.
-  Value *TripCount = nullptr;
-
-  // The number of elements processed by the vector loop.
-  Value *NumElements = nullptr;
-
-  // Other instructions in the icmp chain that calculate the predicate.
-  FixedVectorType *VecTy = nullptr;
-  Instruction *Shuffle = nullptr;
-  Instruction *Induction = nullptr;
-
-  TripCountPattern(Instruction *P, Value *TC, FixedVectorType *VT)
-      : Predicate(P), TripCount(TC), VecTy(VT){};
-};
-
 class MVETailPredication : public LoopPass {
   SmallVector<IntrinsicInst *, 4> MaskedInsts;
   Loop *L = nullptr;
@@ -102,6 +79,8 @@
   TargetTransformInfo *TTI = nullptr;
   TargetLibraryInfo *TLI = nullptr;
   bool ClonedVCTPInExitBlock = false;
+  IntrinsicInst *ActiveLaneMask = nullptr;
+  FixedVectorType *VecTy = nullptr;
 
 public:
   static char ID;
@@ -129,23 +108,25 @@
   /// load/stores.
   bool IsPredicatedVectorLoop();
 
-  /// Compute a value for the total number of elements that the predicated
-  /// loop will process if it is a runtime value.
-  bool ComputeRuntimeElements(TripCountPattern &TCP);
-
-  /// Return whether this is the icmp that generates an i1 vector, based
-  /// upon a loop counter and a limit that is defined outside the loop,
-  /// that generates the active/inactive lanes required for tail-predication.
-  bool isTailPredicate(TripCountPattern &TCP);
+  /// Perform checks on the arguments of the @llvm.get.active.lane.mask
+  /// intrinsic: check that the first is a loop induction variable, and for
+  /// the second check that no overflow can occur in the expressions that use
+  /// this backedge-taken count.
+ bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy); /// Insert the intrinsic to represent the effect of tail predication. - void InsertVCTPIntrinsic(TripCountPattern &TCP, + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy, DenseMap &NewPredicates); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside /// hardware-loops. void RematerializeIterCount(); + + /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs + /// to be lowered to an icmp. + void RevertActiveLaneMask(); }; } // end namespace @@ -179,6 +160,74 @@ DeadInsts); } +void MVETailPredication::RevertActiveLaneMask() { + if (!ActiveLaneMask) + return; + + int VectorWidth = VecTy->getElementCount().Min; + IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI()); + + // 1. Create the vector induction step. This %induction will be the LHS of + // the icmp: + // + // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0 + // %induction = add <4 x i32> %splat, + + Value *Index = ActiveLaneMask->getOperand(0); + Value *SplatIndex = + Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask"); + + SmallVector Indices; + for (int i = 0; i < VectorWidth; ++i) + Indices.push_back(ConstantInt::get(Index->getType(), i)); + + Constant *CV = ConstantVector::get(Indices); + Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction"); + + LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; + dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); + + // 2. In the Preheader, we expect this pattern splatting the BTC to + // a vector. Find this %broadcast.splat, which will be the RHS of the + // the icmp: + // + // %TC.minus.1 = add i32 %N, -1 + // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0 + // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0 + + auto *Preheader = L->getLoopPreheader(); + auto *BTC = ActiveLaneMask->getOperand(1); + Value *SplatBTC = nullptr; + + if (auto *C = dyn_cast(BTC)) { + Builder.SetInsertPoint(Preheader->getTerminator()); + SplatBTC = Builder.CreateVectorSplat(VectorWidth, C); + } else { + Instruction *InsertElem; + for (auto &V : *Preheader) { + InsertElem = dyn_cast(&V); + if (!InsertElem) + continue; + ConstantInt *CI = dyn_cast(InsertElem->getOperand(2)); + if (!CI) + continue; + if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0) + continue; + if ((SplatBTC = dyn_cast(*InsertElem->users().begin()))) + break; + } + } + if (!SplatBTC) + llvm_unreachable("Couldn't revert intrinsic to icmp"); + + Builder.SetInsertPoint(ActiveLaneMask); + Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC); + LLVM_DEBUG(dbgs() << "ARM TP: new compare: " << *ICmp << "\n"); + ActiveLaneMask->replaceAllUsesWith(ICmp); + ActiveLaneMask->eraseFromParent(); +} + bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (skipLoop(L) || DisableTailPredication) return false; @@ -196,6 +245,7 @@ TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; + ActiveLaneMask = nullptr; // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m. 
@@ -256,86 +306,19 @@ if (ClonedVCTPInExitBlock) RematerializeIterCount(); return true; - } + } else + RevertActiveLaneMask(); LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); return false; } -// Pattern match predicates/masks and determine if they use the loop induction -// variable to control the number of elements processed by the loop. If so, -// the loop is a candidate for tail-predication. -bool MVETailPredication::isTailPredicate(TripCountPattern &TCP) { - using namespace PatternMatch; - - // Pattern match the loop body and find the add with takes the index iv - // and adds a constant vector to it: - // - // vector.body: - // .. - // %index = phi i32 - // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, - // <4 x i32> undef, - // <4 x i32> zeroinitializer - // %induction = [add|or] <4 x i32> %broadcast.splat, - // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11 - // - // Please note that the 'or' is equivalent to the 'and' here, this relies on - // BroadcastSplat being the IV which we know is a phi with 0 start and Lanes - // increment, which is all being checked below. - Instruction *BroadcastSplat = nullptr; - Constant *Const = nullptr; - if (!match(TCP.Induction, - m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))) && - !match(TCP.Induction, - m_Or(m_Instruction(BroadcastSplat), m_Constant(Const)))) - return false; - - // Check that we're adding <0, 1, 2, 3... - if (auto *CDS = dyn_cast(Const)) { - for (unsigned i = 0; i < CDS->getNumElements(); ++i) { - if (CDS->getElementAsInteger(i) != i) - return false; - } - } else - return false; - - Instruction *Insert = nullptr; - // The shuffle which broadcasts the index iv into a vector. - if (!match(BroadcastSplat, - m_Shuffle(m_Instruction(Insert), m_Undef(), m_ZeroMask()))) - return false; - - // The insert element which initialises a vector with the index iv. - Instruction *IV = nullptr; - if (!match(Insert, m_InsertElt(m_Undef(), m_Instruction(IV), m_Zero()))) - return false; - - // The index iv. - auto *Phi = dyn_cast(IV); - if (!Phi) - return false; - - // TODO: Don't think we need to check the entry value. - Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); - if (!match(OnEntry, m_Zero())) - return false; - - Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); - unsigned Lanes = cast(Insert->getType())->getNumElements(); - - Instruction *LHS = nullptr; - if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) - return false; - - return LHS == Phi; -} - static FixedVectorType *getVectorType(IntrinsicInst *I) { unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; auto *PtrTy = cast(I->getOperand(TypeOp)->getType()); - return cast(PtrTy->getElementType()); + auto *VecTy = cast(PtrTy->getElementType()); + assert(VecTy && "No scalable vectors expected here"); + return VecTy; } bool MVETailPredication::IsPredicatedVectorLoop() { @@ -368,178 +351,6 @@ return !MaskedInsts.empty(); } -// Pattern match the predicate, which is an icmp with a constant vector of this -// form: -// -// icmp ult <4 x i32> %induction, -// -// and return the constant, i.e. 32002 in this example. This is assumed to be -// the scalar loop iteration count: the number of loop elements by the -// the vector loop. Further checks are performed in function isTailPredicate(), -// to verify 'induction' behaves as an induction variable. 
-// -static bool ComputeConstElements(TripCountPattern &TCP) { - if (!dyn_cast(TCP.TripCount)) - return false; - - ConstantInt *VF = ConstantInt::get( - cast(TCP.TripCount->getType()), TCP.VecTy->getNumElements()); - using namespace PatternMatch; - CmpInst::Predicate CC; - - if (!match(TCP.Predicate, m_ICmp(CC, m_Instruction(TCP.Induction), - m_AnyIntegralConstant())) || - CC != ICmpInst::ICMP_ULT) - return false; - - LLVM_DEBUG(dbgs() << "ARM TP: icmp with constants: "; TCP.Predicate->dump();); - Value *ConstVec = TCP.Predicate->getOperand(1); - - auto *CDS = dyn_cast(ConstVec); - if (!CDS || CDS->getNumElements() != VF->getSExtValue()) - return false; - - if ((TCP.NumElements = CDS->getSplatValue())) { - assert(dyn_cast(TCP.NumElements)->getSExtValue() % - VF->getSExtValue() != - 0 && - "tail-predication: trip count should not be a multiple of the VF"); - LLVM_DEBUG(dbgs() << "ARM TP: Found const elem count: " << *TCP.NumElements - << "\n"); - return true; - } - return false; -} - -// Pattern match the loop iteration count setup: -// -// %trip.count.minus.1 = add i32 %N, -1 -// %broadcast.splatinsert10 = insertelement <4 x i32> undef, -// i32 %trip.count.minus.1, i32 0 -// %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, -// <4 x i32> undef, -// <4 x i32> zeroinitializer -// .. -// vector.body: -// .. -// -static bool MatchElemCountLoopSetup(Loop *L, Instruction *Shuffle, - Value *NumElements) { - using namespace PatternMatch; - Instruction *Insert = nullptr; - - if (!match(Shuffle, - m_Shuffle(m_Instruction(Insert), m_Undef(), m_ZeroMask()))) - return false; - - // Insert the limit into a vector. - Instruction *BECount = nullptr; - if (!match(Insert, - m_InsertElt(m_Undef(), m_Instruction(BECount), m_Zero()))) - return false; - - // The limit calculation, backedge count. 
- Value *TripCount = nullptr; - if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) - return false; - - if (TripCount != NumElements || !L->isLoopInvariant(BECount)) - return false; - - return true; -} - -bool MVETailPredication::ComputeRuntimeElements(TripCountPattern &TCP) { - using namespace PatternMatch; - const SCEV *TripCountSE = SE->getSCEV(TCP.TripCount); - ConstantInt *VF = ConstantInt::get( - cast(TCP.TripCount->getType()), TCP.VecTy->getNumElements()); - - if (VF->equalsInt(1)) - return false; - - CmpInst::Predicate Pred; - if (!match(TCP.Predicate, m_ICmp(Pred, m_Instruction(TCP.Induction), - m_Instruction(TCP.Shuffle))) || - Pred != ICmpInst::ICMP_ULE) - return false; - - LLVM_DEBUG(dbgs() << "Computing number of elements for vector trip count: "; - TCP.TripCount->dump()); - - // Otherwise, continue and try to pattern match the vector iteration - // count expression - auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr * { - if (auto *Const = dyn_cast(S->getOperand(0))) { - if (Const->getAPInt() != -VF->getValue()) - return nullptr; - } else - return nullptr; - return dyn_cast(S->getOperand(1)); - }; - - auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr * { - if (auto *Const = dyn_cast(S->getOperand(0))) { - if (Const->getValue() != VF) - return nullptr; - } else - return nullptr; - return dyn_cast(S->getOperand(1)); - }; - - auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV * { - if (auto *Const = dyn_cast(S->getRHS())) { - if (Const->getValue() != VF) - return nullptr; - } else - return nullptr; - - if (auto *RoundUp = dyn_cast(S->getLHS())) { - if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { - if (Const->getAPInt() != (VF->getValue() - 1)) - return nullptr; - } else - return nullptr; - - return RoundUp->getOperand(1); - } - return nullptr; - }; - - // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to - // determine the numbers of elements instead? Looks like this is what is used - // for delinearization, but I'm not sure if it can be applied to the - // vectorized form - at least not without a bit more work than I feel - // comfortable with. - - // Search for Elems in the following SCEV: - // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF)) - const SCEV *Elems = nullptr; - if (auto *TC = dyn_cast(TripCountSE)) - if (auto *Div = dyn_cast(TC->getOperand(1))) - if (auto *Add = dyn_cast(Div->getLHS())) - if (auto *Mul = VisitAdd(Add)) - if (auto *Div = VisitMul(Mul)) - if (auto *Res = VisitDiv(Div)) - Elems = Res; - - if (!Elems) - return false; - - Instruction *InsertPt = L->getLoopPreheader()->getTerminator(); - if (!isSafeToExpandAt(Elems, InsertPt, *SE)) - return false; - - auto DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "elements"); - TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); - - if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) - return false; - - return true; -} - // Look through the exit block to see whether there's a duplicate predicate // instruction. This can happen when we need to perform a select on values // from the last and previous iteration. 
Instead of doing a straight
@@ -587,7 +398,6 @@
     if (auto *OpI = dyn_cast<Instruction>(U))
       MaybeDead.insert(OpI);
-    I->dropAllReferences();
     Dead.insert(I);
   }
@@ -602,23 +412,148 @@
   return ClonedVCTPInExitBlock;
 }
 
-void MVETailPredication::InsertVCTPIntrinsic(TripCountPattern &TCP,
+// The active lane intrinsic has this form:
+//
+//    @llvm.get.active.lane.mask(IV, BTC)
+//
+// Here we perform checks that this intrinsic behaves as expected,
+// which means:
+//
+// 1) The element count, which is calculated as BTC + 1, cannot overflow.
+// 2) FIXME: The element count needs to be sufficiently large that the
+//    decrement of the element counter doesn't overflow, which means that we
+//    need to prove:
+//        ceil(ElementCount / VectorWidth) >= TripCount
+//    by rounding ElementCount up:
+//        (ElementCount + (VectorWidth - 1)) / VectorWidth
+//    and evaluating whether this expression isKnownNonNegative:
+//        ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+//    FIXME: isKnownNonNegative doesn't understand this expression, and cannot
+//    prove it is non-negative.
+// 3) The IV must be an induction phi with an increment equal to the
+//    vector width.
+bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
+                                          FixedVectorType *VecTy) {
+  int VectorWidth = VecTy->getNumElements();
+  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
+  auto *BTC = SE->getSCEV(BackedgeTakenCount);
+
+  // 1) Test whether entry to the loop is protected by a conditional
+  // BTC + 1 < 0. In other words, if the scalar trip count overflows and
+  // becomes negative, we shouldn't enter the loop, and creating the
+  // tripcount expression BTC + 1 is not safe. So, check that BTC
+  // isn't max. This is evaluated in unsigned, because the semantics
+  // of @llvm.get.active.lane.mask is a ULE comparison.
+  if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/)) {
+    LLVM_DEBUG(dbgs() << "ARM TP: overflow detected in: ";
+               BTC->dump());
+    return false;
+  }
+
+  // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
+  //    ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+  auto *One = SE->getOne(TripCount->getType());
+  auto *ElementCount = SE->getAddExpr(BTC, One);
+  auto *Tmp = SE->getAddExpr(ElementCount,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+  auto Ceil = SE->getUDivExpr(Tmp,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
+  auto *ECMinusTC = SE->getMinusSCEV(Ceil, SE->getSCEV(TripCount));
+  // FIXME: we should be using isKnownNonNegativeInLoop here, but that is
+  // not able to understand this expression, and would reject most/all cases.
+  // Using isKnownNegativeInLoop is probably better than nothing.
+  if (llvm::isKnownNegativeInLoop(ECMinusTC, L, *SE)) {
+    LLVM_DEBUG(dbgs() << "ARM TP: overflow in element count decrement\n");
+    return false;
+  }
+
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
+  // helpers here to get the induction variable, because the hardware loop is
+  // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+  // different counter. Using SCEV, we check that the induction is of the
+  // form i = i + 4, where the increment must be equal to the VectorWidth.
+  auto *IV = ActiveLaneMask->getOperand(0);
+  auto *IVExpr = SE->getSCEV(IV);
+  auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+  if (!AddExpr) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+    return false;
+  }
+  // Check that this AddRec is associated with this loop.
+  if (AddExpr->getLoop() != L) {
+    LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+    return false;
+  }
+  auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+  if (!Step) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+               AddExpr->getOperand(1)->dump());
+    return false;
+  }
+  auto StepValue = Step->getValue()->getSExtValue();
+  if (VectorWidth == StepValue ||
+      VectorWidth == -StepValue)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
+                       "vector width: " << VectorWidth << "\n");
+
+  return false;
+}
+
+// Materialize NumElements in the preheader block.
+static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
+  // First, check whether the BTC is already computed in the preheader:
+  //
+  // preheader:
+  //    %BTC = add i32 %N, -1
+  //    ..
+  // vector.body:
+  //
+  // If %BTC already exists, we don't need to emit %NumElems = %BTC + 1,
+  // but can instead just return %N.
+  for (auto &I : *Preheader) {
+    if (&I == BTC) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+      return I.getOperand(0);
+    }
+  }
+
+  // But we do need to materialise the element count if the BTC is not
+  // already there, e.g. if it is a constant.
+  IRBuilder<> Builder(Preheader->getTerminator());
+  Value *NumElements = Builder.CreateAdd(BTC,
+      ConstantInt::get(BTC->getType(), 1), "num.elements");
+  LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
+  return NumElements;
+}
+
+void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
+                                             Value *TripCount, FixedVectorType *VecTy,
                          DenseMap<Instruction*, Instruction*> &NewPredicates) {
-  IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
 
+  // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
+  // is one less than the trip count. So we need to find or create
+  // %num.elements = %BTC + 1 in the preheader.
+  Value *BTC = ActiveLaneMask->getOperand(1);
+  Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
 
+  // Insert a phi to count the number of elements processed by the loop.
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
-  Processed->addIncoming(TCP.NumElements, L->getLoopPreheader());
+  Processed->addIncoming(NumElements, L->getLoopPreheader());
 
-  // Insert the intrinsic to represent the effect of tail predication.
-  Builder.SetInsertPoint(cast<Instruction>(TCP.Predicate));
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP intrinsic,
+  // and thus represent the effect of tail predication.
+  Builder.SetInsertPoint(ActiveLaneMask);
   ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), TCP.VecTy->getNumElements());
+      ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
 
   Intrinsic::ID VCTPID;
-  switch (TCP.VecTy->getNumElements()) {
+  switch (VecTy->getNumElements()) {
   default: llvm_unreachable("unexpected number of lanes");
   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -632,9 +567,9 @@
     // purposes, but takes a v4i1 instead of a v2i1.
} Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); - Value *TailPredicate = Builder.CreateCall(VCTP, Processed); - TCP.Predicate->replaceAllUsesWith(TailPredicate); - NewPredicates[TCP.Predicate] = cast(TailPredicate); + Value *VCTPCall = Builder.CreateCall(VCTP, Processed); + ActiveLaneMask->replaceAllUsesWith(VCTPCall); + NewPredicates[ActiveLaneMask] = cast(VCTPCall); // Add the incoming value to the new phi. // TODO: This add likely already exists in the loop. @@ -642,7 +577,7 @@ Processed->addIncoming(Remaining, L->getLoopLatch()); LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: " << *Processed << "\n" - << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n"); + << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } bool MVETailPredication::TryConvert(Value *TripCount) { @@ -653,51 +588,33 @@ LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - // Walk through the masked intrinsics and try to find whether the predicate - // operand is generated from an induction variable. SetVector Predicates; DenseMap NewPredicates; -#ifndef NDEBUG - // For debugging purposes, use this to indicate we have been able to - // pattern match the scalar loop trip count. - bool FoundScalarTC = false; -#endif - + // Walk through the masked intrinsics and try to find whether the predicate + // operand is generated by intrinsic @llvm.get.active.lane.mask(). for (auto *I : MaskedInsts) { - Intrinsic::ID ID = I->getIntrinsicID(); - // First, find the icmp used by this masked load/store. - unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3; + unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3; auto *Predicate = dyn_cast(I->getArgOperand(PredOp)); if (!Predicate || Predicates.count(Predicate)) continue; - // Step 1: using this icmp, now calculate the number of elements - // processed by this loop. - TripCountPattern TCP(Predicate, TripCount, getVectorType(I)); - if (!(ComputeConstElements(TCP) || ComputeRuntimeElements(TCP))) + ActiveLaneMask = dyn_cast(Predicate); + if (!ActiveLaneMask || + ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; - LLVM_DEBUG(FoundScalarTC = true); - - if (!isTailPredicate(TCP)) { - LLVM_DEBUG(dbgs() << "ARM TP: Not an icmp that generates tail predicate: " - << *Predicate << "\n"); - continue; - } - - LLVM_DEBUG(dbgs() << "ARM TP: Found icmp generating tail predicate: " - << *Predicate << "\n"); Predicates.insert(Predicate); + LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " + << *ActiveLaneMask << "\n"); - // Step 2: emit the VCTP intrinsic representing the effect of TP. - InsertVCTPIntrinsic(TCP, NewPredicates); - } - - if (!NewPredicates.size()) { - LLVM_DEBUG(if (!FoundScalarTC) - dbgs() << "ARM TP: Can't determine loop itertion count\n"); - return false; + VecTy = getVectorType(I); + if (!IsSafeActiveMask(TripCount, VecTy)) { + LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates); } // Now clean up. 
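
Note for reviewers (not part of the diff): roughly, the rewrite that IsSafeActiveMask guards and InsertVCTPIntrinsic performs looks as follows for a 4-wide loop; the value names %mask and %elems are illustrative and the masked loads/stores are elided.

  ; On input, the vectoriser emits the lane mask from the vector index and the
  ; backedge-taken count (%trip.count.minus.1 = %N - 1):
  vector.body:
    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %mask  = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
    ...
    %index.next = add i32 %index, 4

  ; After this pass, the mask is produced by VCTP from an element counter that
  ; starts at %N and is decremented by the vector width on each iteration;
  ; ARMLowOverheadLoops later turns this into a DLSTP/WLSTP loop:
  vector.body:
    %elems = phi i32 [ %N, %vector.ph ], [ %elems.rem, %vector.body ]
    %mask  = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
    %elems.rem = sub i32 %elems, 4
    ...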
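
As a worked instance of the overflow reasoning in IsSafeActiveMask (numbers chosen purely for illustration): with N = 10 elements and VectorWidth = 4, the backedge-taken count is BTC = 9, so

  ElementCount     = BTC + 1            = 10
  Ceil             = (10 + (4 - 1)) / 4 = 3
  Ceil - TripCount = 3 - 3              = 0

which is non-negative, so decrementing the element counter by the vector width each iteration cannot wrap; the loop is only rejected when ScalarEvolution can prove this difference is negative.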
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -34,16 +34,19 @@ %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer %induction = or <16 x i32> %broadcast.splat, %tmp = getelementptr inbounds i8, i8* %a, i32 %index - %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i8* %tmp to <16 x i8>* - %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef) + %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index %tmp4 = bitcast i8* %tmp3 to <16 x i8>* - %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef) + %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index %tmp7 = bitcast i8* %tmp6 to <16 x i8>* - tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1) + tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask) %index.next = add i32 %index, 16 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -87,16 +90,19 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* - %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) + %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index %tmp4 = bitcast i16* %tmp3 to <8 x i16>* - %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef) + %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index %tmp7 = bitcast i16* %tmp6 to <8 x i16>* - tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %tmp1) + tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask) %index.next = add i32 %index, 8 %tmp15 = call i32 
@llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -139,16 +145,17 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* - %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* - tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -192,14 +199,15 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2> %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 1, i32 3> %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* - %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 0, i32 2> %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 1, i32 3> %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low @@ -207,7 +215,7 @@ %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast 
i32* %tmp6 to <4 x i32>* - tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -250,17 +258,20 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef) %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* - tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -272,6 +283,7 @@ ; The store now uses ult predicate. 
; CHECK-LABEL: mismatch_store_pred +; CHECK: vector.body: ; CHECK: %index = phi i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) @@ -304,13 +316,16 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* - %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* @@ -334,4 +349,6 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll @@ -23,45 +23,53 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>* -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 
[[TMP5]]) -; CHECK-NEXT: [[TMP7]] = sub i32 [[TMP5]], 4 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP6]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]]) +; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP5]], 4 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP10]] = getelementptr i32, i32* [[LSR_IV9]], i32 4 -; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1) -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: [[TMP10]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1) +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0 +; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]] ; CHECK: vector.body75: ; CHECK-NEXT: [[LSR_IV6:%.*]] = phi i32* [ [[S1:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP7:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP15:%.*]], [[VECTOR_BODY75]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT84:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX80]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT85:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT84]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION86:%.*]] = add <4 x i32> [[BROADCAST_SPLAT85]], -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP12]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef) -; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> 
[[WIDE_MASKED_LOAD]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP14]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP14]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef) +; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP16]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP15]]) ; CHECK-NEXT: [[INDEX_NEXT81]] = add i32 [[INDEX80]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i32, i32* [[LSR_IV3]], i32 4 ; CHECK-NEXT: [[SCEVGEP7]] = getelementptr i32, i32* [[LSR_IV6]], i32 4 -; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1) -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: [[TMP17]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP12]], i32 1) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret i32 0 ; @@ -100,7 +108,7 @@ %induction = add <4 x i32> %broadcast.splat, %5 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0 %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer - %7 = icmp ule <4 x i32> %induction, %6 + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.183) call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %broadcast.splat72, <4 x i32>* %lsr.iv911, i32 4, <4 x i1> %7) %index.next = add i32 %index, 4 %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 4 @@ -143,3 +151,7 @@ declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -8,30 +8,28 @@ ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: adds r4, r3, #3 +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: sub.w r12, r4, #4 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r12, lsr #2 -; CHECK-NEXT: lsr.w r4, r12, #2 -; CHECK-NEXT: sub.w r12, 
r3, r4, lsl #2 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w lr, lr, r12, lsr #2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: and r5, r4, #15 +; CHECK-NEXT: and r4, r12, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vdup.32 q3, r5 +; CHECK-NEXT: vdup.32 q3, r4 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vcmp.i32 eq, q3, zr -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vpsel q1, q2, q1 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst @@ -41,11 +39,10 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -65,7 +62,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -98,37 +98,35 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, ; CHECK-LABEL: vpsel_mul_reduce_add_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: ldr.w r12, [sp, #20] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r5, r12, #3 +; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: subs r4, r5, #4 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: add.w lr, r5, r4, lsr #2 -; CHECK-NEXT: lsrs r4, r4, #2 -; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: bic r4, r4, #3 +; CHECK-NEXT: sub.w lr, r4, #4 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: and r6, r5, #15 +; CHECK-NEXT: and r5, r4, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 -; CHECK-NEXT: vdup.32 q3, r6 +; CHECK-NEXT: vdup.32 q3, r5 ; CHECK-NEXT: vsub.i32 q1, q2, q1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vcmp.i32 eq, q3, zr -; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vpsel q1, q1, q2 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst @@ -138,15 +136,14 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ 
%middle.block -; CHECK-NEXT: vctp.32 r4 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 @@ -167,7 +164,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -204,19 +204,19 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, ; CHECK-LABEL: and_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: ldr.w r12, [sp, #16] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: ldr.w r12, [sp, #12] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB2_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: subs r5, r4, #4 +; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: lsrs r4, r5, #2 -; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -225,23 +225,27 @@ ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vsub.i32 q1, q2, q1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vcmpt.i32 eq, q1, zr ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r4 +; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 @@ -262,7 +266,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + 
%tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -296,19 +303,19 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { ; CHECK-LABEL: or_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: ldr.w r12, [sp, #16] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: ldr.w r12, [sp, #12] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB3_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: subs r5, r4, #4 +; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: lsrs r4, r5, #2 -; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -317,24 +324,28 @@ ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vpnot +; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpnot ; CHECK-NEXT: vpstee ; CHECK-NEXT: vcmpt.i32 ne, q1, zr ; CHECK-NEXT: vldrwe.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwe.u32 q2, [r2], #16 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r4 +; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -354,7 +365,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -392,9 +406,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpst @@ -423,7 +439,10 @@ %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer %tmp13 = add <4 x i32> %tmp12, %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10 - %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 + + ; 
%tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 + %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %tmp6) + %tmp16 = bitcast i32* %tmp14 to <4 x i32>* %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef) %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer @@ -449,6 +468,7 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -456,6 +476,7 @@ ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 @@ -482,7 +503,10 @@ %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer %tmp16 = add <4 x i32> %tmp15, %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13 - %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 + + ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 + %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %tmp7) + %tmp19 = bitcast i32* %tmp17 to <4 x i32>* %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef) %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer @@ -509,3 +533,5 @@ ; Function Attrs: nounwind readnone willreturn declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -8,9 +8,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.s16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 @@ -36,7 +38,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = sext <8 x i8> %wide.masked.load to <8 x i16> @@ -62,9 +67,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 @@ -90,7 +97,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <8 x i32> 
%induction, %broadcast.splat11 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -116,9 +126,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 @@ -144,7 +156,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -170,9 +185,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.u32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 @@ -198,7 +215,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -223,3 +243,5 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -35,9 +35,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_4: @ %vector.ph +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 ; CHECK-NEXT: vmul.f32 q0, q1, q0 @@ -135,7 +137,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds 
float, float* %b, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + + ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast float* %2 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef) %5 = getelementptr inbounds float, float* %c, i32 %index @@ -224,12 +229,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt @@ -238,7 +243,6 @@ ; CHECK-NEXT: vfma.f32 q0, q3, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 @@ -274,7 +278,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %c, i32 %index @@ -586,3 +593,4 @@ ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32 immarg, <4 x i1>, <4 x half>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -15,12 +15,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -28,7 +28,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -54,7 +53,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> 
@llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -88,12 +90,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -101,7 +103,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -127,7 +128,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -161,12 +165,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -174,7 +178,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -200,7 +203,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -234,12 +240,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -247,7 +253,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -273,7 +278,10 @@ %broadcast.splat = shufflevector <4 x i32> 
%broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -307,12 +315,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -320,7 +328,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -345,7 +352,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12 @@ -399,9 +409,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -502,7 +514,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + + ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -605,9 +620,11 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -636,7 +653,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, 
i16* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -692,9 +712,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -795,7 +817,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + +; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -898,9 +923,11 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -929,7 +956,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -985,9 +1015,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -1085,7 +1117,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i32, i32* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + +; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i32* %2 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef) %5 = getelementptr inbounds i32, i32* %b, i32 %index @@ -1175,9 
+1210,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 @@ -1203,7 +1240,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -1230,6 +1270,5 @@ declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - - - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -9,6 +9,9 @@ ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 @@ -21,12 +24,16 @@ ; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add 
<4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 @@ -86,7 +93,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index - %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + + ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp8 = bitcast i16* %tmp6 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef) %tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -121,6 +131,9 @@ ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 @@ -133,12 +146,16 @@ ; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 @@ -196,7 +213,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index - %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + + ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp8 = bitcast i32* %tmp6 to <4 x 
i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef) %tmp9 = getelementptr inbounds i32, i32* %B, i32 %index @@ -236,6 +256,8 @@ ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #2 +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) + attributes #0 = { argmemonly nounwind readonly willreturn } attributes #1 = { nounwind readnone willreturn } attributes #2 = { noduplicate nounwind } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -10,17 +10,22 @@ ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32002, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 @@ -46,7 +51,10 @@ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = icmp ult <4 x i32> %induction, + + ; %1 = icmp ult <4 x i32> %induction, + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -171,6 +179,7 @@ ; UGT here: %1 = icmp ugt <4 x i32> %induction, + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -199,25 +208,27 @@ ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = sub i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 -; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) +; CHECK-NEXT: [[TMP6:%.*]] = icmp 
ne i32 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -237,7 +248,10 @@ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = icmp ult <4 x i32> %induction, + + ; %1 = icmp ult <4 x i32> %induction, + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -305,8 +319,325 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, -; non-uniform constant vector here: +; Non-uniform constant vector here. This can't be represented with +; @llvm.get.active.lane.mask, but let's keep this test as a sanity check: %1 = icmp ult <4 x i32> %induction, + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @overflow( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; +; CHECK: ret void +; +define dso_local void @overflow(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> 
%broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + +; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow: + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @IV_not_an_induction( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: ret void +; +define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + +; The first operand %N is not a loop induction variable: + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32002) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup +
+for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @IV_wrong_step( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: ret void +; +define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + +; %index is incremented with 3 and not 4, which is the vectorisation factor +; that we expect here: + %index.next = add i32 %index, 3 + + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @IV_step_not_constant( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: ret void +; +define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body 
], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + +; %index is incremented with some runtime value, i.e. not a constant: + %index.next = add i32 %index, %N + + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @outerloop_phi( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; +; CHECK: ret void +; +define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp24 = icmp eq i32 %N, 0 + br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader + +vector.ph.preheader: ; preds = %entry + br label %vector.ph + +vector.ph: ; preds = %vector.ph.preheader, %for.cond.cleanup3 + %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ] + %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ] + %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ] + %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ] + call void @llvm.set.loop.iterations.i32(i32 1025) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ] + %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ] + %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = phi i32 [ 1025, %vector.ph ], [ %2, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>* + %lsr.iv2830 = bitcast i32* 
%lsr.iv28 to <4 x i32>* + +; It's using %j.025, the induction variable from its outer loop: + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask) + %index.next = add i32 %index, 4 + %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4 + %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4 + %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %3 = icmp ne i32 %2, 0 + br i1 %3, label %vector.body, label %for.cond.cleanup3 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %vector.body + %inc11 = add nuw i32 %j.025, 1 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 1 + %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1 + %exitcond26 = icmp eq i32 %inc11, %N + br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph +} + +; CHECK-LABEL: @overflow_in_sub( +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: ret void +; +define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + +; Overflow in the subtraction.
This should hold: +; +; ceil(ElementCount / VectorWidth) >= TripCount +; +; But we have: +; +; ceil(32000 / 4) >= 8001 +; 8000 >= 8001 +; + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 31999) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -323,7 +654,9 @@ ret void } + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 ) declare void @llvm.set.loop.iterations.i32(i32) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -27,7 +27,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -84,7 +87,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -143,7 +149,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -171,3 +180,5 @@ declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1>
@llvm.get.active.lane.mask.v8i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -1,36 +1,43 @@ ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s ; CHECK-LABEL: reduction_i32 -; CHECK: phi i32 [ 0, %entry ] -; CHECK: phi <8 x i16> [ zeroinitializer, %entry ] +; CHECK: phi i32 [ 0, %vector.ph ] +; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] ; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ] +; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) { entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: %tmp = add i32 %N, -1 - %n.rnd.up = add nuw nsw i32 %tmp, 8 + %n.rnd.up = add i32 %tmp, 8 %n.vec = and i32 %n.rnd.up, -8 %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer %0 = add i32 %n.vec, -8 %1 = lshr i32 %0, 3 - %2 = add nuw nsw i32 %1, 1 + %2 = add i32 %1, 1 call void @llvm.set.loop.iterations.i32(i32 %2) br label %vector.body -vector.body: ; preds = %vector.body, %entry - %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp8, %vector.body ] - %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ] +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ] + %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + + ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index @@ -38,7 +45,7 @@ %wide.masked.load3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp7 = add <8 x i16> %wide.masked.load, %vec.phi %tmp8 = add <8 x i16> %tmp7, %wide.masked.load3 - %index.next = add nuw nsw i32 %index, 8 + %index.next = add i32 %index, 8 %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1) %5 = icmp ne i32 %4, 0 br i1 %5, label %vector.body, label %middle.block @@ -56,17 +63,101 @@ 
%bin.rdx7 = add <8 x i16> %rdx.shuf6, %bin.rdx5 %tmp11 = extractelement <8 x i16> %bin.rdx7, i32 0 ret i16 %tmp11 + +for.cond.cleanup: + %res.0 = phi i16 [ 0, %entry ] + ret i16 %res.0 } ; CHECK-LABEL: reduction_i32_with_scalar -; CHECK: phi i32 [ 0, %entry ] -; CHECK: phi <8 x i16> [ zeroinitializer, %entry ] -; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ] +; CHECK: vector.body: +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: + %tmp = add i32 %N, -1 + %n.rnd.up = add nuw nsw i32 %tmp, 8 + %n.vec = and i32 %n.rnd.up, -8 + %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 + %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer + %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0 + %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = add i32 %n.vec, -8 + %1 = lshr i32 %0, 3 + %2 = add nuw nsw i32 %1, 1 + call void @llvm.set.loop.iterations.i32(i32 %2) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ] + %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ] + %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer + %induction = add <8 x i32> %broadcast.splat, + %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index + + ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + + %tmp4 = bitcast i16* %tmp2 to <8 x i16>* + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) + %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4 + %tmp6 = add <8 x i16> %tmp5, %wide.masked.load + %index.next = add nuw nsw i32 %index, 8 + %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1) + %5 = icmp ne i32 %4, 0 + br i1 %5, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %tmp8 = select <8 x i1> %tmp3, <8 x i16> %tmp6, <8 x i16> %vec.phi + %rdx.shuf = shufflevector <8 x i16> %tmp8, <8 x i16> undef, <8 x i32> + %bin.rdx = add <8 x i16> %rdx.shuf, %tmp8 + %rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> + %bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx + %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> + %bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6 + %tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0 + ret i16 %tmp9 + 
+for.cond.cleanup: + %res.0 = phi i16 [ 0, %entry ] + ret i16 %res.0 +} + +; The vector loop is not guarded with an entry check (N == 0). +; This means we can't calculate a precise range for the backedge count in +; @llvm.get.active.lane.mask, and are assuming overflow can happen and thus +; we can't insert the VCTP here. +; +; CHECK-LABEL: @reduction_not_guarded +; +; CHECK-NOT: @llvm.arm.mve.vctp +; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 +; +; CHECK: entry: +; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1 +; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0 +; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer +; +; CHECK: vector.body: +; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 +; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) +; CHECK: ret +; +define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { entry: %tmp = add i32 %N, -1 %n.rnd.up = add nuw nsw i32 %tmp, 8 @@ -81,15 +172,18 @@ call void @llvm.set.loop.iterations.i32(i32 %2) br label %vector.body -vector.body: ; preds = %vector.body, %entry - %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp6, %vector.body ] +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ] %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + + ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4 @@ -114,5 +208,5 @@ declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -15,8 +15,7 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 @@ -25,12 +24,12 @@ ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -53,7 +52,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -89,8 +91,7 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: lsrs r1, r1, #2 -; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -98,11 +99,11 @@ ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -125,7 +126,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi @@ -157,8 +161,7 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: lsrs r1, r1, #2 -; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -166,11 +169,11 @@ ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -193,7 +196,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi @@ -218,9 +224,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 @@ -247,7 +255,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11 @@ -269,9 +280,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 @@ -298,7 +311,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11 @@ -320,9 +336,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 @@ -348,7 +366,10 @@ %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer %induction = add <16 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + + ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef) %3 = getelementptr inbounds i8, i8* %c, i32 %index @@ -374,9 +395,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; 
CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: vmul.i16 q0, q1, q0 @@ -402,7 +425,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef) %3 = getelementptr inbounds i16, i16* %c, i32 %index @@ -427,4 +453,6 @@ declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -5,12 +5,9 @@ ; CHECK: vector.ph: ; CHECK: call void @llvm.set.loop.iterations.i32 -; CHECK: [[UF:%[^ ]+]] = shl i32 %{{.*}}, 2 -; CHECK: [[REMAT_ITER:%[^ ]+]] = sub i32 %N, [[UF]] ; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: phi i32 [ 0, %vector.ph ] ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 @@ -18,7 +15,7 @@ ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], ; CHECK: middle.block: -; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER]]) +; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]]) ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) @@ -32,14 +29,14 @@ %4 = lshr i32 %3, 2 %5 = add nuw nsw i32 %4, 1 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph - + vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer call void @llvm.set.loop.iterations.i32(i32 %5) br label %vector.body - + vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] @@ -51,7 +48,10 @@ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 + + ; %7 = icmp ule <4 
x i32> %induction, %broadcast.splat12 + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef) %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef) %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load @@ -62,20 +62,23 @@ %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %11 = icmp ne i32 %10, 0 br i1 %11, label %vector.body, label %middle.block - + middle.block: ; preds = %vector.body - %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 +; TODO: check that the intrinsic is also emitted here by the loop vectoriser +; %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13) br label %for.cond.cleanup - + for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ] ret i32 %res.0.lcssa } - + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/mve-fma-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -4,22 +4,24 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmas1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -40,7 +42,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -61,22 +66,24 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* 
nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmas2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -97,7 +104,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -119,22 +129,24 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fma1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -155,7 +167,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -176,22 +191,24 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fma2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; 
CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -212,7 +229,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 @@ -234,23 +254,25 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: eor r12, r12, #-2147483648 +; CHECK-NEXT: eor r12, r4, #-2147483648 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -272,7 +294,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -293,17 +318,19 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt 
{r7, pc} -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -311,7 +338,7 @@ ; CHECK-NEXT: vstrw.32 q3, [r2], #16 ; CHECK-NEXT: letp lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -332,7 +359,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -361,9 +391,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 @@ -392,7 +424,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -421,9 +456,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -452,7 +489,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 
%index @@ -474,23 +514,25 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: eor r12, r12, #-2147483648 +; CHECK-NEXT: eor r12, r4, #-2147483648 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -512,7 +554,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -540,9 +585,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q2, q1, q0 @@ -570,7 +617,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -592,23 +642,25 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: 
vneg.f32 q0, q0 -; CHECK-NEXT: vfma.f32 q0, q1, r12 -; CHECK-NEXT: vstrw.32 q0, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -629,7 +681,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -651,23 +706,25 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vfma.f32 q0, q1, r12 -; CHECK-NEXT: vstrw.32 q0, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -688,7 +745,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 @@ -710,3 +770,4 @@ declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
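
For reference, the commented-out icmp that is kept next to each new @llvm.get.active.lane.mask call in the tests above computes the same per-lane predicate: lane i is active iff %index + i is no greater than the backedge-taken count (the trip count minus 1) passed as the second argument. A minimal standalone sketch of that equivalence for a vector width of 4 is given below; the function name and the fixed width are illustrative only and are not part of this patch. When overflow in this computation cannot be ruled out (see the reduction_not_guarded test), the expected output is this icmp form rather than a VCTP.

define <4 x i1> @active_lane_mask_as_icmp(i32 %index, i32 %btc) {
entry:
  ; Splat the scalar induction value %index to all lanes and add the
  ; per-lane offsets <0, 1, 2, 3> to form the vector induction.
  %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %index.splat, <i32 0, i32 1, i32 2, i32 3>
  ; Splat the backedge-taken count and compare; this mirrors
  ; call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
  ; as used in the tests above.
  %btc.splatinsert = insertelement <4 x i32> undef, i32 %btc, i32 0
  %btc.splat = shufflevector <4 x i32> %btc.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %mask = icmp ule <4 x i32> %induction, %btc.splat
  ret <4 x i1> %mask
}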