diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -31,15 +31,14 @@
 /// blocks of instructions operating on different vector types.
 ///
 /// This pass:
-/// 1) Pattern matches the scalar iteration count produced by the vectoriser.
-///    The scalar loop iteration count represents the number of elements to be
-///    processed.
-///    TODO: this could be emitted using an intrinsic, similar to the hardware
-///    loop intrinsics, so that we don't need to pattern match this here.
-/// 2) Inserts the VCTP intrinsic to represent the effect of
-///    tail predication. This will be picked up by the ARM Low-overhead loop
-///    pass, which performs the final transformation to a DLSTP or WLSTP
-///    tail-predicated loop.
+/// 1) Checks if the predicates of the masked load/store instructions are
+///    generated by intrinsic @llvm.get.active.lane.mask(). This intrinsic
+///    consumes the Backedge Taken Count (BTC) of the scalar loop as its
+///    second argument, which we extract to set up the number of elements
+///    processed by the loop.
+/// 2) Intrinsic @llvm.get.active.lane.mask() is then replaced by the MVE
+///    target-specific VCTP intrinsic to represent the effect of tail
+///    predication. This will be picked up by the ARM Low-overhead loop pass,
+///    which performs the final transformation to a DLSTP or WLSTP
+///    tail-predicated loop.
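To make the expected input shape concrete, here is a minimal sketch of point 1) above (function and value names are illustrative, not taken from the patch): the predicate of the masked operation comes directly from @llvm.get.active.lane.mask, whose second operand is the BTC, and lane i of the mask is active iff %index + i <= %btc (unsigned):

```llvm
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

define <4 x i32> @predicated_load(<4 x i32>* %p, i32 %index, i32 %btc) {
entry:
  ; Lane i of %mask is active iff %index + i <= %btc (unsigned comparison).
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
  ; The mask predicates the masked memory operations in the loop body.
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %p, i32 4, <4 x i1> %mask, <4 x i32> undef)
  ret <4 x i32> %v
}
```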
 
 #include "ARM.h"
 #include "ARMSubtarget.h"
@@ -64,34 +63,18 @@
 #define DEBUG_TYPE "mve-tail-predication"
 #define DESC "Transform predicated vector loops to use MVE tail predication"
 
+static cl::opt<bool>
+ForceTailPredication("force-tail-predication", cl::Hidden, cl::init(false),
+                     cl::desc("Force tail-predication even if it might be "
+                              "unsafe (e.g. possible overflow in loop "
+                              "counters)"));
+
 cl::opt<bool>
 DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
                        cl::init(true),
                        cl::desc("Disable MVE Tail Predication"));
 
 namespace {
 
-// Bookkeeping for pattern matching the loop trip count and the number of
-// elements processed by the loop.
-struct TripCountPattern {
-  // An icmp instruction that calculates a predicate of active/inactive lanes
-  // used by the masked loads/stores.
-  Instruction *Predicate = nullptr;
-
-  // The add instruction that increments the IV.
-  Value *TripCount = nullptr;
-
-  // The number of elements processed by the vector loop.
-  Value *NumElements = nullptr;
-
-  // Other instructions in the icmp chain that calculate the predicate.
-  FixedVectorType *VecTy = nullptr;
-  Instruction *Shuffle = nullptr;
-  Instruction *Induction = nullptr;
-
-  TripCountPattern(Instruction *P, Value *TC, FixedVectorType *VT)
-      : Predicate(P), TripCount(TC), VecTy(VT){};
-};
-
 class MVETailPredication : public LoopPass {
   SmallVector<IntrinsicInst *, 4> MaskedInsts;
   Loop *L = nullptr;
@@ -102,6 +85,8 @@
   TargetTransformInfo *TTI = nullptr;
   TargetLibraryInfo *TLI = nullptr;
   bool ClonedVCTPInExitBlock = false;
+  IntrinsicInst *ActiveLaneMask = nullptr;
+  FixedVectorType *VecTy = nullptr;
 
 public:
   static char ID;
@@ -129,23 +114,25 @@
   /// load/stores.
   bool IsPredicatedVectorLoop();
 
-  /// Compute a value for the total number of elements that the predicated
-  /// loop will process if it is a runtime value.
-  bool ComputeRuntimeElements(TripCountPattern &TCP);
-
-  /// Return whether this is the icmp that generates an i1 vector, based
-  /// upon a loop counter and a limit that is defined outside the loop,
-  /// that generates the active/inactive lanes required for tail-predication.
-  bool isTailPredicate(TripCountPattern &TCP);
+  /// Perform checks on the arguments of the @llvm.get.active.lane.mask
+  /// intrinsic: check that the first is a loop induction variable, and for
+  /// the second check that no overflow can occur in the expressions that use
+  /// this backedge-taken count.
+  bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);
 
   /// Insert the intrinsic to represent the effect of tail predication.
-  void InsertVCTPIntrinsic(TripCountPattern &TCP,
+  void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+                           FixedVectorType *VecTy,
                            DenseMap<Instruction *, Instruction *> &NewPredicates);
 
   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
+
+  /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it
+  /// needs to be lowered to an icmp.
+  void RevertActiveLaneMask();
 };
 
 } // end namespace
@@ -179,6 +166,83 @@
                                   DeadInsts);
 }
 
+void MVETailPredication::RevertActiveLaneMask() {
+  if (!ActiveLaneMask)
+    return;
+
+  int VectorWidth = VecTy->getElementCount().Min;
+  IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());
+
+  // 1. Create the vector induction step. This %induction will be the LHS of
+  // the icmp:
+  //
+  // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  // %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
+  //
+  Value *Index = ActiveLaneMask->getOperand(0);
+  Value *SplatIndex =
+      Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");
+
+  SmallVector<Constant *, 8> Indices;
+  for (int i = 0; i < VectorWidth; ++i)
+    Indices.push_back(ConstantInt::get(Index->getType(), i));
+
+  Constant *CV = ConstantVector::get(Indices);
+  Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");
+
+  LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
+             dbgs() << "ARM TP: New Induction: " << *Induction << "\n");
+
+  // 2. In the Preheader, first check whether the splat BTC already exists.
+  // Find this %splat, which will be the RHS of the icmp:
+  //
+  // %TC.minus.1 = add i32 %N, -1
+  // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
+  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  //
+  auto *Preheader = L->getLoopPreheader();
+  auto *BTC = ActiveLaneMask->getOperand(1);
+  Value *SplatBTC = nullptr;
+
+  if (auto *C = dyn_cast<ConstantInt>(BTC)) {
+    Builder.SetInsertPoint(Preheader->getTerminator());
+    SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
+    LLVM_DEBUG(dbgs() << "ARM TP: New splat BTC: " << *SplatBTC << "\n");
+  } else {
+    Instruction *InsertElem;
+    for (auto &V : *Preheader) {
+      InsertElem = dyn_cast<InsertElementInst>(&V);
+      if (!InsertElem)
+        continue;
+      ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
+      if (!CI)
+        continue;
+      if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
+        continue;
+      if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
+        break;
+    }
+  }
+  // Or create the splat BTC if it doesn't exist.
+  if (!SplatBTC) {
+    Builder.SetInsertPoint(Preheader->getTerminator());
+    Value *Undef =
+        UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
+    Value *Insert = Builder.CreateInsertElement(Undef,
+        BTC, Builder.getInt32(0), "insert.btc");
+    Value *Zero = ConstantInt::get(Insert->getType(), 0);
+    SplatBTC = Builder.CreateShuffleVector(Insert, Undef, Zero, "splat.btc");
+    LLVM_DEBUG(dbgs() << "ARM TP: New splat BTC: " << *SplatBTC << "\n");
+  }
+
+  Builder.SetInsertPoint(ActiveLaneMask);
+  Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
+  LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
+  ActiveLaneMask->replaceAllUsesWith(ICmp);
+  ActiveLaneMask->eraseFromParent();
+}
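Written out as IR, the reverted form that these builder calls produce looks roughly as follows; a hand-worked sketch with illustrative names (the real pass emits the splat in the preheader and the icmp at the intrinsic's position):

```llvm
define <4 x i1> @reverted_lane_mask(i32 %index, i32 %btc) {
entry:
  ; Emitted in the preheader: splat of the backedge-taken count.
  %insert.btc = insertelement <4 x i32> undef, i32 %btc, i32 0
  %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  ; Emitted at the head of the block that held the intrinsic.
  %lane.mask = insertelement <4 x i32> undef, i32 %index, i32 0
  %lane.mask.splat = shufflevector <4 x i32> %lane.mask, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
  ; The icmp that replaces all uses of @llvm.get.active.lane.mask.
  %mask = icmp ule <4 x i32> %lane.mask.induction, %splat.btc
  ret <4 x i1> %mask
}
```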
+
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -196,6 +260,7 @@
   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
   DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
+  ActiveLaneMask = nullptr;
 
   // The MVE and LOB extensions are combined to enable tail-predication, but
   // there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -256,86 +321,19 @@
     if (ClonedVCTPInExitBlock)
       RematerializeIterCount();
     return true;
-  }
+  } else
+    RevertActiveLaneMask();
 
   LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
   return false;
 }
 
-// Pattern match predicates/masks and determine if they use the loop induction
-// variable to control the number of elements processed by the loop. If so,
-// the loop is a candidate for tail-predication.
-bool MVETailPredication::isTailPredicate(TripCountPattern &TCP) {
-  using namespace PatternMatch;
-
-  // Pattern match the loop body and find the add which takes the index iv
-  // and adds a constant vector to it:
-  //
-  // vector.body:
-  // ..
-  // %index = phi i32
-  // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
-  //                                  <4 x i32> undef,
-  //                                  <4 x i32> zeroinitializer
-  // %induction = [add|or] <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-  // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
-  //
-  // Please note that the 'or' is equivalent to the 'add' here, this relies on
-  // BroadcastSplat being the IV which we know is a phi with 0 start and Lanes
-  // increment, which is all being checked below.
-  Instruction *BroadcastSplat = nullptr;
-  Constant *Const = nullptr;
-  if (!match(TCP.Induction,
-             m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))) &&
-      !match(TCP.Induction,
-             m_Or(m_Instruction(BroadcastSplat), m_Constant(Const))))
-    return false;
-
-  // Check that we're adding <0, 1, 2, 3...
-  if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
-    for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
-      if (CDS->getElementAsInteger(i) != i)
-        return false;
-    }
-  } else
-    return false;
-
-  Instruction *Insert = nullptr;
-  // The shuffle which broadcasts the index iv into a vector.
-  if (!match(BroadcastSplat,
-             m_Shuffle(m_Instruction(Insert), m_Undef(), m_ZeroMask())))
-    return false;
-
-  // The insert element which initialises a vector with the index iv.
-  Instruction *IV = nullptr;
-  if (!match(Insert, m_InsertElt(m_Undef(), m_Instruction(IV), m_Zero())))
-    return false;
-
-  // The index iv.
-  auto *Phi = dyn_cast<PHINode>(IV);
-  if (!Phi)
-    return false;
-
-  // TODO: Don't think we need to check the entry value.
-  Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
-  if (!match(OnEntry, m_Zero()))
-    return false;
-
-  Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
-  unsigned Lanes = cast<FixedVectorType>(Insert->getType())->getNumElements();
-
-  Instruction *LHS = nullptr;
-  if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
-    return false;
-
-  return LHS == Phi;
-}
-
 static FixedVectorType *getVectorType(IntrinsicInst *I) {
   unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
   auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
-  return cast<FixedVectorType>(PtrTy->getElementType());
+  auto *VecTy = dyn_cast<FixedVectorType>(PtrTy->getElementType());
+  assert(VecTy && "No scalable vectors expected here");
+  return VecTy;
 }
 
 bool MVETailPredication::IsPredicatedVectorLoop() {
@@ -368,178 +366,6 @@
   return !MaskedInsts.empty();
 }
 
-// Pattern match the predicate, which is an icmp with a constant vector of this
-// form:
-//
-// icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
-//
-// and return the constant, i.e. 32002 in this example. This is assumed to be
-// the scalar loop iteration count: the number of elements processed by the
-// vector loop. Further checks are performed in function isTailPredicate(),
-// to verify 'induction' behaves as an induction variable.
-//
-static bool ComputeConstElements(TripCountPattern &TCP) {
-  if (!dyn_cast<ConstantInt>(TCP.TripCount))
-    return false;
-
-  ConstantInt *VF = ConstantInt::get(
-      cast<IntegerType>(TCP.TripCount->getType()), TCP.VecTy->getNumElements());
-  using namespace PatternMatch;
-  CmpInst::Predicate CC;
-
-  if (!match(TCP.Predicate, m_ICmp(CC, m_Instruction(TCP.Induction),
-                                   m_AnyIntegralConstant())) ||
-      CC != ICmpInst::ICMP_ULT)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "ARM TP: icmp with constants: "; TCP.Predicate->dump(););
-  Value *ConstVec = TCP.Predicate->getOperand(1);
-
-  auto *CDS = dyn_cast<ConstantDataSequential>(ConstVec);
-  if (!CDS || CDS->getNumElements() != VF->getSExtValue())
-    return false;
-
-  if ((TCP.NumElements = CDS->getSplatValue())) {
-    assert(dyn_cast<ConstantInt>(TCP.NumElements)->getSExtValue() %
-                   VF->getSExtValue() !=
-               0 &&
-           "tail-predication: trip count should not be a multiple of the VF");
-    LLVM_DEBUG(dbgs() << "ARM TP: Found const elem count: " << *TCP.NumElements
-                      << "\n");
-    return true;
-  }
-  return false;
-}
-
-// Pattern match the loop iteration count setup:
-//
-// %trip.count.minus.1 = add i32 %N, -1
-// %broadcast.splatinsert10 = insertelement <4 x i32> undef,
-//                                          i32 %trip.count.minus.1, i32 0
-// %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
-//                                    <4 x i32> undef,
-//                                    <4 x i32> zeroinitializer
-// ..
-// vector.body:
-// ..
-//
-static bool MatchElemCountLoopSetup(Loop *L, Instruction *Shuffle,
-                                    Value *NumElements) {
-  using namespace PatternMatch;
-  Instruction *Insert = nullptr;
-
-  if (!match(Shuffle,
-             m_Shuffle(m_Instruction(Insert), m_Undef(), m_ZeroMask())))
-    return false;
-
-  // Insert the limit into a vector.
-  Instruction *BECount = nullptr;
-  if (!match(Insert,
-             m_InsertElt(m_Undef(), m_Instruction(BECount), m_Zero())))
-    return false;
-
-  // The limit calculation, backedge count.
-  Value *TripCount = nullptr;
-  if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
-    return false;
-
-  if (TripCount != NumElements || !L->isLoopInvariant(BECount))
-    return false;
-
-  return true;
-}
-
-bool MVETailPredication::ComputeRuntimeElements(TripCountPattern &TCP) {
-  using namespace PatternMatch;
-  const SCEV *TripCountSE = SE->getSCEV(TCP.TripCount);
-  ConstantInt *VF = ConstantInt::get(
-      cast<IntegerType>(TCP.TripCount->getType()), TCP.VecTy->getNumElements());
-
-  if (VF->equalsInt(1))
-    return false;
-
-  CmpInst::Predicate Pred;
-  if (!match(TCP.Predicate, m_ICmp(Pred, m_Instruction(TCP.Induction),
-                                   m_Instruction(TCP.Shuffle))) ||
-      Pred != ICmpInst::ICMP_ULE)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "Computing number of elements for vector trip count: ";
-             TCP.TripCount->dump());
-
-  // Otherwise, continue and try to pattern match the vector iteration
-  // count expression
-  auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr * {
-    if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
-      if (Const->getAPInt() != -VF->getValue())
-        return nullptr;
-    } else
-      return nullptr;
-    return dyn_cast<SCEVMulExpr>(S->getOperand(1));
-  };
-
-  auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr * {
-    if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
-      if (Const->getValue() != VF)
-        return nullptr;
-    } else
-      return nullptr;
-    return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
-  };
-
-  auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV * {
-    if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
-      if (Const->getValue() != VF)
-        return nullptr;
-    } else
-      return nullptr;
-
-    if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
-      if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
-        if (Const->getAPInt() != (VF->getValue() - 1))
-          return nullptr;
-      } else
-        return nullptr;
-
-      return RoundUp->getOperand(1);
-    }
-    return nullptr;
-  };
-
-  // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
-  // determine the numbers of elements instead? Looks like this is what is used
-  // for delinearization, but I'm not sure if it can be applied to the
-  // vectorized form - at least not without a bit more work than I feel
-  // comfortable with.
-
-  // Search for Elems in the following SCEV:
-  // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF))
-  const SCEV *Elems = nullptr;
-  if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
-    if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
-      if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
-        if (auto *Mul = VisitAdd(Add))
-          if (auto *Div = VisitMul(Mul))
-            if (auto *Res = VisitDiv(Div))
-              Elems = Res;
-
-  if (!Elems)
-    return false;
-
-  Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
-  if (!isSafeToExpandAt(Elems, InsertPt, *SE))
-    return false;
-
-  auto DL = L->getHeader()->getModule()->getDataLayout();
-  SCEVExpander Expander(*SE, DL, "elements");
-  TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
-
-  if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
-    return false;
-
-  return true;
-}
-
 // Look through the exit block to see whether there's a duplicate predicate
 // instruction. This can happen when we need to perform a select on values
 // from the last and previous iteration. Instead of doing a straight
@@ -587,7 +413,6 @@
       if (auto *OpI = dyn_cast<Instruction>(U))
         MaybeDead.insert(OpI);
 
-    I->dropAllReferences();
     Dead.insert(I);
   }
 
@@ -602,23 +427,207 @@
   return ClonedVCTPInExitBlock;
 }
 
-void MVETailPredication::InsertVCTPIntrinsic(TripCountPattern &TCP,
+// The active lane intrinsic has this form:
+//
+//    @llvm.get.active.lane.mask(IV, BTC)
+//
+// Here we perform checks that this intrinsic behaves as expected,
+// which means:
+//
+// 1) The element count, which is calculated with BTC + 1, cannot overflow.
+// 2) The element count needs to be sufficiently large that the decrement of
+//    the element counter doesn't overflow, which means that we need to prove:
+//        ceil(ElementCount / VectorWidth) >= TripCount
+//    by rounding ElementCount up:
+//        (ElementCount + (VectorWidth - 1)) / VectorWidth
+//    and evaluating if this expression isKnownNonNegative:
+//        ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+// 3) The IV must be an induction phi with an increment equal to the
+//    vector width.
+bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
+                                          FixedVectorType *VecTy) {
+  // 1) Test whether entry to the loop is protected by a conditional
+  // BTC + 1 < 0. In other words, if the scalar trip count overflows and
+  // becomes negative, we shouldn't enter the loop and creating the
+  // tripcount expression BTC + 1 is not safe. So, check that BTC
+  // isn't max. This is evaluated in unsigned, because the semantics
+  // of @get.active.lane.mask is a ULE comparison.
+  int VectorWidth = VecTy->getNumElements();
+  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
+  auto *BTC = SE->getSCEV(BackedgeTakenCount);
+
+  if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+      !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
+               BTC->dump());
+    return false;
+  }
+
+  // 2) Prove that the sub expression is non-negative, i.e. it doesn't
+  // overflow:
+  //
+  //    ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+  //
+  // 2.1) First prove overflow can't happen in:
+  //
+  //    ElementCount + (VectorWidth - 1)
+  //
+  // Because of a lack of context, it is difficult to get a useful bound on
+  // this expression. But since ElementCount uses the same variables as the
+  // TripCount (TC), for which we can find meaningful value ranges, we use
+  // that instead and assert that:
+  //
+  //    upperbound(TC) <= UINT_MAX - VectorWidth
+  //
+  auto *TC = SE->getSCEV(TripCount);
+  unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+  auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
+  uint64_t MaxMinusVW = Diff.getZExtValue();
+  uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+
+  if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
+               dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
+               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
+    return false;
+  }
+
+  // 2.2) Make sure overflow doesn't happen in the final expression:
+  //    ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount.
+  // To do this, compare the full ranges of these subexpressions:
+  //
+  //    Range(Ceil) <= Range(TC)
+  //
+  // where Ceil = (ElementCount + (VW-1)) / VW. If Ceil and TC are runtime
+  // values (and not constants), we have to compensate for the lowerbound
+  // value range to be off by 1.
+  // The reason is that BTC lives in the preheader in this form:
+  //
+  //    %trip.count.minus = add nsw nuw i32 %N, -1
+  //
+  // For the loop to be executed, %N has to be >= 1 and as a result the value
+  // range of %trip.count.minus has a lower bound of 0. Value %TC has this
+  // form:
+  //
+  //    %5 = add nuw nsw i32 %4, 1
+  //    call void @llvm.set.loop.iterations.i32(i32 %5)
+  //
+  // where %5 is some expression using %N, which needs to have a lower bound
+  // of 1. Thus, if the ranges of Ceil and TC are not a single constant but a
+  // set, we first add 0 to TC such that we can do the <= comparison on both
+  // sets.
+  //
+  auto *One = SE->getOne(TripCount->getType());
+  // ElementCount = BTC + 1
+  auto *ElementCount = SE->getAddExpr(BTC, One);
+  // Tmp = ElementCount + (VW-1)
+  auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+  // Ceil = (ElementCount + (VW-1)) / VW
+  auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
+
+  ConstantRange RangeCeil = SE->getSignedRange(Ceil);
+  ConstantRange RangeTC = SE->getSignedRange(TC);
+  if (!RangeTC.isSingleElement()) {
+    auto ZeroRange =
+        ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
+    RangeTC = RangeTC.unionWith(ZeroRange);
+  }
+  if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
+    return false;
+  }
+
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
+  // helpers here to get the induction variable, because the hardware loop is
+  // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+  // different counter. Using SCEV, we check that the induction is of the
+  // form i = i + 4, where the increment must be equal to the VectorWidth.
+  auto *IV = ActiveLaneMask->getOperand(0);
+  auto *IVExpr = SE->getSCEV(IV);
+  auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+  if (!AddExpr) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+    return false;
+  }
+  // Check that this AddRec is associated with this loop.
+  if (AddExpr->getLoop() != L) {
+    LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+    return false;
+  }
+  auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+  if (!Step) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+               AddExpr->getOperand(1)->dump());
+    return false;
+  }
+  auto StepValue = Step->getValue()->getSExtValue();
+  if (VectorWidth == StepValue)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
+                       "vector width: " << VectorWidth << "\n");
+
+  return false;
+}
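To make check 2) concrete, here is a worked instance under the assumptions common to the tests below (my numbers, not from the patch): i32 counters, VectorWidth VW = 4, and a loop guarded by %N >= 1, so that BTC = %N - 1, ElementCount = %N, and the trip count TC passed to @llvm.set.loop.iterations is ceil(%N / 4):

```latex
\[
  \mathit{Ceil} - \mathit{TC}
    = \left\lfloor \frac{\mathit{ElementCount} + (\mathit{VW}-1)}{\mathit{VW}} \right\rfloor - \mathit{TC}
    = \left\lfloor \frac{N + 3}{4} \right\rfloor - \left\lceil \frac{N}{4} \right\rceil
    = 0 \;\ge\; 0
\]
```

This holds as long as N + 3 does not wrap; that wrap is exactly what the upperbound(TC) <= UINT_MAX - VectorWidth test in step 2.1 rules out.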
+
+// Materialize NumElements in the preheader block.
+static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
+  // First, check whether the preheader already contains:
+  //
+  // preheader:
+  //    %BTC = add i32 %N, -1
+  //    ..
+  // vector.body:
+  //
+  // If %BTC already exists in this form, we don't need to emit
+  // %NumElems = %BTC + 1, but can instead just return %N.
+  for (auto &I : *Preheader) {
+    if (I.getOpcode() != Instruction::Add || &I != BTC)
+      continue;
+    ConstantInt *MinusOne = nullptr;
+    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+      continue;
+    if (MinusOne->getSExtValue() == -1) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+      return I.getOperand(0);
+    }
+  }
+
+  // Otherwise, materialise %NumElements = %BTC + 1, which is needed
+  // e.g. when %BTC is a constant.
+  IRBuilder<> Builder(Preheader->getTerminator());
+  Value *NumElements = Builder.CreateAdd(BTC,
+      ConstantInt::get(BTC->getType(), 1), "num.elements");
+  LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
+  return NumElements;
+}
+
+void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
+    Value *TripCount, FixedVectorType *VecTy,
     DenseMap<Instruction *, Instruction *> &NewPredicates) {
-  IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
 
+  // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
+  // is one less than the number of elements processed by the scalar loop. So
+  // we need to find or create %num.elements = %BTC + 1 in the preheader.
+  Value *BTC = ActiveLaneMask->getOperand(1);
+  Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
+
   // Insert a phi to count the number of elements processed by the loop.
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
-  Processed->addIncoming(TCP.NumElements, L->getLoopPreheader());
+  Processed->addIncoming(NumElements, L->getLoopPreheader());
 
-  // Insert the intrinsic to represent the effect of tail predication.
-  Builder.SetInsertPoint(cast<Instruction>(TCP.Predicate));
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
+  Builder.SetInsertPoint(ActiveLaneMask);
   ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), TCP.VecTy->getNumElements());
+      ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
 
   Intrinsic::ID VCTPID;
-  switch (TCP.VecTy->getNumElements()) {
+  switch (VecTy->getNumElements()) {
   default:
     llvm_unreachable("unexpected number of lanes");
   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -632,9 +641,9 @@
     // purposes, but takes a v4i1 instead of a v2i1.
   }
   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
-  Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
-  TCP.Predicate->replaceAllUsesWith(TailPredicate);
-  NewPredicates[TCP.Predicate] = cast<Instruction>(TailPredicate);
+  Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
+  ActiveLaneMask->replaceAllUsesWith(VCTPCall);
+  NewPredicates[ActiveLaneMask] = cast<Instruction>(VCTPCall);
 
   // Add the incoming value to the new phi.
   // TODO: This add likely already exists in the loop.
@@ -642,7 +651,7 @@
   Processed->addIncoming(Remaining, L->getLoopLatch());
   LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
              << *Processed << "\n"
-             << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+             << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
 }
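Taken together, the rewrite that getNumElements() and InsertVCTPIntrinsic() perform looks roughly like this in IR; a sketch with illustrative names that elides the masked memory operations and the low-overhead loop intrinsics:

```llvm
declare <4 x i1> @llvm.arm.mve.vctp32(i32)

; Before: %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
; After: the mask comes from VCTP on an element counter that starts at
; %btc + 1 and decrements by the vector width every iteration.
define void @vctp_sketch(i32 %btc) {
vector.ph:
  %num.elements = add i32 %btc, 1
  br label %vector.body

vector.body:
  %elems = phi i32 [ %num.elements, %vector.ph ], [ %remaining, %vector.body ]
  %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
  ; ... masked loads/stores predicated on %mask ...
  %remaining = sub i32 %elems, 4
  %cmp = icmp sgt i32 %remaining, 0
  br i1 %cmp, label %vector.body, label %exit

exit:
  ret void
}
```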
 
 bool MVETailPredication::TryConvert(Value *TripCount) {
@@ -653,51 +662,33 @@
 
   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
 
-  // Walk through the masked intrinsics and try to find whether the predicate
-  // operand is generated from an induction variable.
   SetVector<Instruction *> Predicates;
   DenseMap<Instruction *, Instruction *> NewPredicates;
 
-#ifndef NDEBUG
-  // For debugging purposes, use this to indicate we have been able to
-  // pattern match the scalar loop trip count.
-  bool FoundScalarTC = false;
-#endif
-
+  // Walk through the masked intrinsics and try to find whether the predicate
+  // operand is generated by intrinsic @llvm.get.active.lane.mask().
   for (auto *I : MaskedInsts) {
-    Intrinsic::ID ID = I->getIntrinsicID();
-    // First, find the icmp used by this masked load/store.
-    unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
+    unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3;
    auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
    if (!Predicate || Predicates.count(Predicate))
      continue;
 
-    // Step 1: using this icmp, now calculate the number of elements
-    // processed by this loop.
-    TripCountPattern TCP(Predicate, TripCount, getVectorType(I));
-    if (!(ComputeConstElements(TCP) || ComputeRuntimeElements(TCP)))
+    ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+    if (!ActiveLaneMask ||
+        ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
       continue;
 
-    LLVM_DEBUG(FoundScalarTC = true);
-
-    if (!isTailPredicate(TCP)) {
-      LLVM_DEBUG(dbgs() << "ARM TP: Not an icmp that generates tail predicate: "
-                        << *Predicate << "\n");
-      continue;
-    }
-
-    LLVM_DEBUG(dbgs() << "ARM TP: Found icmp generating tail predicate: "
-                      << *Predicate << "\n");
     Predicates.insert(Predicate);
+    LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
                       << *ActiveLaneMask << "\n");
 
-    // Step 2: emit the VCTP intrinsic representing the effect of TP.
-    InsertVCTPIntrinsic(TCP, NewPredicates);
-  }
-
-  if (!NewPredicates.size()) {
-    LLVM_DEBUG(if (!FoundScalarTC)
-                   dbgs() << "ARM TP: Can't determine loop itertion count\n");
-    return false;
+    VecTy = getVectorType(I);
+    if (!IsSafeActiveMask(TripCount, VecTy)) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates);
   }
 
   // Now clean up.
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
 
 ; CHECK-LABEL: mul_v16i8
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
@@ -34,16 +35,19 @@
   %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
   %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
-  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
-  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
+  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
   %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
-  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
+  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
   %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
   %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
-  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1)
+  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
   %index.next = add i32 %index, 16
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
@@ -54,6 +58,7 @@
 }
 
 ; CHECK-LABEL: mul_v8i16
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
@@ -87,16 +92,19 @@
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
-  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
+  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
   %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
-  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
+  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
   %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
   %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
-  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %tmp1)
+  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
   %index.next = add i32 %index, 8
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
@@ -107,6 +115,7 @@
 }
 
 ; CHECK-LABEL: mul_v4i32
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
@@ -139,16 +148,17 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
-  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
-  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
@@ -159,6 +169,7 @@
 }
 
 ; CHECK-LABEL: split_vector
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
@@ -192,14 +203,15 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
   %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 1, i32 3>
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
-  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
   %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 1, i32 3>
   %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
@@ -207,7 +219,7 @@
   %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
-  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
@@ -250,17 +262,20 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
+  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
-  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
@@ -272,6 +287,8 @@
 
 ; The store now uses ult predicate.
 ; CHECK-LABEL: mismatch_store_pred
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
+; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
@@ -304,13 +321,16 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
-  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
@@ -334,4 +354,6 @@
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 declare void @llvm.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
-
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
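If the unsafe paths guarded by ForceTailPredication ever need exercising from a test, a RUN line along the following lines should do it; this is a sketch derived from the RUN line above, with the -force-tail-predication spelling taken from the cl::opt added in this patch:

```llvm
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication \
; RUN:     -disable-mve-tail-predication=false -force-tail-predication \
; RUN:     -mattr=+mve,+lob %s -S -o - | FileCheck %s
```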
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
@@ -23,45 +23,53 @@
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]])
+; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>*
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]])
-; CHECK-NEXT: [[TMP7]] = sub i32 [[TMP5]], 4
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP6]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]])
+; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP5]], 4
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP8]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[SCEVGEP10]] = getelementptr i32, i32* [[LSR_IV9]], i32 4
-; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1)
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT: [[TMP10]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1)
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]]
 ; CHECK: vector.body75:
 ; CHECK-NEXT: [[LSR_IV6:%.*]] = phi i32* [ [[S1:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP7:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT: [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT: [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP15:%.*]], [[VECTOR_BODY75]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT: [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>*
 ; CHECK-NEXT: [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>*
 ; CHECK-NEXT: [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT84:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX80]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT85:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT84]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[INDUCTION86:%.*]] = add <4 x i32> [[BROADCAST_SPLAT85]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP12]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef)
-; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef)
-; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]])
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP14]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP13]])
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP14]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP16]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP15]])
 ; CHECK-NEXT: [[INDEX_NEXT81]] = add i32 [[INDEX80]], 4
 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
 ; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i32, i32* [[LSR_IV3]], i32 4
 ; CHECK-NEXT: [[SCEVGEP7]] = getelementptr i32, i32* [[LSR_IV6]], i32 4
-; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1)
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT: [[TMP17]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP12]], i32 1)
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret i32 0
 ;
@@ -100,7 +108,7 @@
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %5 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0
   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
-  %7 = icmp ule <4 x i32> %induction, %6
+  %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.183)
   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %broadcast.splat72, <4 x i32>* %lsr.iv911, i32 4, <4 x i1> %7)
   %index.next = add i32 %index, 4
   %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 4
@@ -143,3 +151,7 @@
 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
 declare void @llvm.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -8,30 +8,28 @@
 ; CHECK-NEXT: itt eq
 ; CHECK-NEXT: moveq r0, #0
 ; CHECK-NEXT: bxeq lr
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, lr}
 ; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: adds r4, r3, #3
+; CHECK-NEXT: add.w r12, r3, #3
+; CHECK-NEXT: mov.w lr, #1
+; CHECK-NEXT: bic r12, r12, #3
 ; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: bic r4, r4, #3
-; CHECK-NEXT: sub.w r12, r4, #4
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: add.w lr, r4, r12, lsr #2
-; CHECK-NEXT: lsr.w r4, r12, #2
-; CHECK-NEXT: sub.w r12, r3, r4, lsl #2
-; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: add.w lr, lr, r12, lsr #2
+; CHECK-NEXT: mov.w r12, #0
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB0_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r3
-; CHECK-NEXT: and r5, r4, #15
+; CHECK-NEXT: and r4, r12, #15
 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT: vdup.32 q3, r5
+; CHECK-NEXT: vdup.32 q3, r4
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT: vcmp.i32 eq, q3, zr
-; CHECK-NEXT: adds r4, #4
+; CHECK-NEXT: add.w r12, r12, #4
 ; CHECK-NEXT: vpsel q1, q2, q1
 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT: vpst
@@ -41,11 +39,10 @@
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB0_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vctp.32 r12
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vaddv.u32 r0, q0
 ; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -65,7 +62,10 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -98,37 +98,35 @@
 define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
 ; CHECK-LABEL: vpsel_mul_reduce_add_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: ldr.w r12, [sp, #20]
 ; CHECK-NEXT: cmp.w r12, #0
 ; CHECK-NEXT: beq .LBB1_4
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r5, r12, #3
+; CHECK-NEXT: add.w r4, r12, #3
 ; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: bic r5, r5, #3
-; CHECK-NEXT: subs r4, r5, #4
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: add.w lr, r5, r4, lsr #2
-; CHECK-NEXT: lsrs r4, r4, #2
-; CHECK-NEXT: sub.w r4, r12, r4, lsl #2
-; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: bic r4, r4, #3
+; CHECK-NEXT: sub.w lr, r4, #4
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: add.w lr, r4, lr, lsr #2
+; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB1_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: and r6, r5, #15
+; CHECK-NEXT: and r5, r4, #15
 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
-; CHECK-NEXT: vdup.32 q3, r6
+; CHECK-NEXT: vdup.32 q3, r5
 ; CHECK-NEXT: vsub.i32 q1, q2, q1
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT: vcmp.i32 eq, q3, zr
-; CHECK-NEXT: adds r5, #4
+; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vpsel q1, q1, q2
 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT: vpst
@@ -138,15 +136,14 @@
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB1_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
-; CHECK-NEXT: vctp.32 r4
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vaddv.u32 r0, q0
 ; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
 ; CHECK-NEXT: .LBB1_4:
 ; CHECK-NEXT: movs r0, #0
 ; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
   i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -167,7 +164,10 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -204,19 +204,19 @@
 define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
 ; CHECK-LABEL: and_mul_reduce_add:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: ldr.w r12, [sp, #16]
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: ldr.w r12, [sp, #12]
 ; CHECK-NEXT: cmp.w r12, #0
 ; CHECK-NEXT: beq .LBB2_4
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
 ; CHECK-NEXT: add.w r4, r12, #3
 ; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: bic r4, r4, #3
-; CHECK-NEXT: subs r5, r4, #4
+; CHECK-NEXT: sub.w lr, r4, #4
 ; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: add.w lr, r4, r5, lsr #2
-; CHECK-NEXT: lsrs r4, r5, #2
-; CHECK-NEXT: sub.w r4, r12, r4, lsl #2
+; CHECK-NEXT: add.w lr, r4, lr, lsr #2
+; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB2_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -225,23 +225,27 @@
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT: vsub.i32 q1, q2, q1
+; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vpsttt
 ; CHECK-NEXT: vcmpt.i32 eq, q1, zr
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vmul.i32 q1, q2, q1
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB2_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
-; CHECK-NEXT: vctp.32 r4
+; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop {r4, pc}
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop {r4, pc}
   i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -262,7 +266,10 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -296,19 +303,19 @@
 define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 ; CHECK-LABEL: or_mul_reduce_add:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: ldr.w r12, [sp, #16]
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: ldr.w r12, [sp, #12]
 ; CHECK-NEXT: cmp.w r12, #0
 ; CHECK-NEXT: beq .LBB3_4
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
 ; CHECK-NEXT: add.w r4, r12, #3
 ; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: bic r4, r4, #3
-; CHECK-NEXT: subs r5, r4, #4
+; CHECK-NEXT: sub.w lr, r4, #4
 ; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: add.w lr, r4, r5, lsr #2
-; CHECK-NEXT: lsrs r4, r5, #2
-; CHECK-NEXT: sub.w r4, r12, r4, lsl #2
+; CHECK-NEXT: add.w lr, r4, lr, lsr #2
+; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB3_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -317,24 +324,28 @@
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vpnot
+; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT: vsub.i32 q1, q2, q1
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vpnot
 ; CHECK-NEXT: vpstee
 ; CHECK-NEXT: vcmpt.i32 ne, q1, zr
 ; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
 ; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
+; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vmul.i32 q1, q2, q1
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB3_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
-; CHECK-NEXT: vctp.32 r4
+; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop {r4, pc}
 ; CHECK-NEXT: .LBB3_4:
 ; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: add sp, #4
CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -354,7 +365,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -392,9 +406,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpst @@ -423,7 +439,10 @@ %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer %tmp13 = add <4 x i32> %tmp12, %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10 - %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 + + ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 + %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %tmp6) + %tmp16 = bitcast i32* %tmp14 to <4 x i32>* %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef) %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer @@ -449,6 +468,7 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -456,6 +476,7 @@ ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 @@ -482,7 +503,10 @@ %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer %tmp16 = add <4 x i32> %tmp15, %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13 - %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 + + ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 + %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %tmp7) + %tmp19 = bitcast i32* %tmp17 to <4 x i32>* %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef) %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer @@ -509,3 +533,5 @@ ; Function Attrs: nounwind readnone willreturn declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -8,9 +8,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.s16 q0, 
[r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 @@ -36,7 +38,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = sext <8 x i8> %wide.masked.load to <8 x i16> @@ -62,9 +67,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 @@ -90,7 +97,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -116,9 +126,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 @@ -144,7 +156,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -170,9 +185,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.u32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 @@ -198,7 +215,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast 
i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -223,3 +243,5 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -35,9 +35,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_4: @ %vector.ph +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 ; CHECK-NEXT: vmul.f32 q0, q1, q0 @@ -135,7 +137,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds float, float* %b, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + + ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast float* %2 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef) %5 = getelementptr inbounds float, float* %c, i32 %index @@ -224,12 +229,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt @@ -238,7 +243,6 @@ ; CHECK-NEXT: vfma.f32 q0, q3, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 @@ -274,7 +278,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %c, i32 %index @@ -586,3 +593,4 @@ ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32 immarg, <4 x i1>, <4 x half>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -15,12 +15,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -28,7 +28,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -54,7 +53,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -88,12 +90,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -101,7 +103,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -127,7 +128,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -161,12 +165,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -174,7 +178,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop 
{r7, pc} @@ -200,7 +203,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -234,12 +240,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -247,7 +253,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -273,7 +278,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -307,12 +315,12 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -320,7 +328,6 @@ ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -345,7 +352,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12 @@ -399,9 +409,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: 
@ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -502,7 +514,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + + ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -605,9 +620,11 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -636,7 +653,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -692,9 +712,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -795,7 +817,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + +; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -898,9 +923,11 @@ ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -929,7 +956,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <4 x 
i32> %induction, %broadcast.splat13 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -985,9 +1015,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -1085,7 +1117,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i32, i32* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + +; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i32* %2 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef) %5 = getelementptr inbounds i32, i32* %b, i32 %index @@ -1175,9 +1210,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 @@ -1203,7 +1240,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -1230,6 +1270,5 @@ declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - - - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -9,6 +9,9 @@ ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP:%.*]] = 
add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 @@ -21,12 +24,16 @@ ; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 @@ -86,7 +93,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index - %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + + ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp8 = bitcast i16* %tmp6 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef) %tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -121,6 +131,9 @@ ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 @@ -133,12 +146,16 @@ ; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ 
[[TMP12:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 @@ -196,7 +213,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index - %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + + ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp8 = bitcast i32* %tmp6 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef) %tmp9 = getelementptr inbounds i32, i32* %B, i32 %index @@ -221,6 +241,7 @@ ret void } + ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #0 @@ -236,6 +257,8 @@ ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #2 +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) + attributes #0 = { argmemonly nounwind readonly willreturn } attributes #1 = { nounwind readnone willreturn } attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -10,17 +10,22 @@ ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32002, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 @@ -46,7 +51,10 @@ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = icmp ult <4 x i32> %induction, + + ; %1 = icmp ult <4 x i32> %induction, + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -171,6 +179,7 @@ ; UGT here: %1 = icmp ugt <4 x i32> %induction, + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -187,10 +196,8 @@ ret void } -; Check that this loop behaves as expected, i.e, that the loop increment is -; an increment and not a decrement. 
-define dso_local void @foo4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { -; CHECK-LABEL: @foo4( +define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: @foo5( ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -206,12 +213,12 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]]) -; CHECK-NEXT: [[INDEX_NEXT]] = sub i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 @@ -237,14 +244,66 @@ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = icmp ult <4 x i32> %induction, + +; Non-uniform constant vector here. 
This can't be represented with +; @llvm.get.active.lane.mask, but let's keep this test as a sanity check: + %1 = icmp ult <4 x i32> %induction, + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} -; Counting down: - %index.next = sub i32 %index, 4 +; CHECK-LABEL: @overflow_BTC_plus_1( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; +; CHECK: ret void +; +define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + +; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow: + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 @@ -256,38 +315,12 @@ ret void } -define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone 
%D, i32 %N) local_unnamed_addr #0 {
-; CHECK-LABEL: @foo5(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001)
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
-; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
-; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]],
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]],
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
-; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
-; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
-; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @overflow_in_sub(
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+; CHECK: ret void
;
+define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
call void @llvm.set.loop.iterations.i32(i32 8001)
br label %vector.body
@@ -305,8 +338,63 @@
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat,
-; non-uniform constant vector here:
- %1 = icmp ult <4 x i32> %induction,
+; Overflow in the subtraction.
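+; The backedge-taken count passed to @llvm.get.active.lane.mask below is
+; 31999, so the element count it describes is BTC + 1 = 32000.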
+; This should hold:
+;
+; ceil(ElementCount / VectorWidth) >= TripCount
+;
+; But we have:
+;
+; ceil(32000 / 4) >= 8001
+; 8000 >= 8001
+;
+ %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 31999)
+
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
+ %index.next = add i32 %index, 4
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
+ %4 = icmp ne i32 %3, 0
+ br i1 %4, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-LABEL: @overflow_in_rounding_tripcount(
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+; CHECK: ret void
+;
+define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+entry:
+
+; TC = 4294967292
+; 4294967292 <= 4294967291 (UINT_MAX - vectorwidth)
+; False
+;
+ call void @llvm.set.loop.iterations.i32(i32 4294967291)
+ br label %vector.body
+
+vector.body:
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
+ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
+ %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
+ %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat,
+
+ %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -323,7 +411,225 @@
ret void
}
+
+; CHECK-LABEL: @IV_not_an_induction(
+;
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+;
+; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0
+; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat,
+; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction,
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: ret void
+;
+define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 8001)
+ br label %vector.body
+
+vector.body:
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
+ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
+ %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
+ %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat,
+
+; The first argument %N is not a loop induction variable:
+ %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32002)
+
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
+ %index.next = add i32 %index, 4
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
+ %4 = icmp ne i32 %3, 0
+ br i1 %4, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-LABEL: @IV_wrong_step(
+;
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+;
+; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat,
+; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction,
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: ret void
+;
+define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 8001)
+ br label %vector.body
+
+vector.body:
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
+ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
+ %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
+ %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat,
+
+ %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
+
+; %index is incremented by 3 and not by 4, which is the vectorisation factor
+; that we expect here:
+ %index.next = add i32 %index, 3
+
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
+ %4 = icmp ne i32 %3, 0
+ br i1 %4, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-LABEL: @IV_step_not_constant(
+;
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+;
+; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat,
+; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction,
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: ret void
+;
+define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 8001)
+ br label %vector.body
+
+vector.body:
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
+ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
+ %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
+ %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat,
+ %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
+
+; %index is incremented by some runtime value, i.e.
not a constant: + %index.next = add i32 %index, %N + + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @outerloop_phi( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; +; CHECK: ret void +; +define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp24 = icmp eq i32 %N, 0 + br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader + +vector.ph.preheader: ; preds = %entry + br label %vector.ph + +vector.ph: ; preds = %vector.ph.preheader, %for.cond.cleanup3 + %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ] + %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ] + %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ] + %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ] + call void @llvm.set.loop.iterations.i32(i32 1025) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ] + %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ] + %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = phi i32 [ 1025, %vector.ph ], [ %2, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>* + %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>* + +; It's using %j.025, the induction variable from its outer loop: + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask) + %index.next = add i32 %index, 4 + %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4 + %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4 + %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %3 = icmp ne i32 %2, 0 + br i1 %3, label %vector.body, label %for.cond.cleanup3 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %vector.body + %inc11 = add nuw i32 %j.025, 1 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep32 = 
getelementptr i32, i32* %lsr.iv31, i32 1 + %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1 + %exitcond26 = icmp eq i32 %inc11, %N + br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph +} + + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 ) declare void @llvm.set.loop.iterations.i32(i32) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -27,7 +27,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -84,7 +87,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -143,7 +149,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -171,3 +180,5 @@ declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -1,36 +1,45 @@ ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication 
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false \
+; RUN:   -force-tail-predication -mattr=+mve %s -S -o - | FileCheck %s --check-prefix=FORCE

 ; CHECK-LABEL: reduction_i32
-; CHECK: phi i32 [ 0, %entry ]
-; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
+; CHECK: phi i32 [ 0, %vector.ph ]
+; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ]
 ; CHECK: phi i32
-; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
 entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:
   %tmp = add i32 %N, -1
-  %n.rnd.up = add nuw nsw i32 %tmp, 8
+  %n.rnd.up = add i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
   %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
   %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
-  %2 = add nuw nsw i32 %1, 1
+  %2 = add i32 %1, 1
   call void @llvm.set.loop.iterations.i32(i32 %2)
   br label %vector.body

-vector.body: ; preds = %vector.body, %entry
-  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp8, %vector.body ]
-  %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ]
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %tmp8, %vector.body ]
+  %3 = phi i32 [ %2, %vector.ph ], [ %4, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-  %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
+
+  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
+  %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
+
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
@@ -38,7 +47,7 @@
   %wide.masked.load3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp7 = add <8 x i16> %wide.masked.load, %vec.phi
   %tmp8 = add <8 x i16> %tmp7, %wide.masked.load3
-  %index.next = add nuw nsw i32 %index, 8
+  %index.next = add i32 %index, 8
   %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1)
   %5 = icmp ne i32 %4, 0
   br i1 %5, label %vector.body, label %middle.block
@@ -56,18 +65,102 @@
   %bin.rdx7 = add <8 x i16> %rdx.shuf6, %bin.rdx5
   %tmp11 = extractelement <8 x i16> %bin.rdx7, i32 0
   ret i16 %tmp11
+
+for.cond.cleanup:
+  %res.0 = phi i16 [ 0, %entry ]
+  ret i16 %res.0
+}
+
 ; CHECK-LABEL: reduction_i32_with_scalar
-; CHECK: phi i32 [ 0, %entry ]
-; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
-; CHECK: phi i32
-; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: vector.body:
+; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
 entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:
+  %tmp = add i32 %N, -1
+  %n.rnd.up = add nuw nsw i32 %tmp, 8
+  %n.vec = and i32 %n.rnd.up, -8
+  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
+  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
+  %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
+  %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %0 = add i32 %n.vec, -8
+  %1 = lshr i32 %0, 3
+  %2 = add nuw nsw i32 %1, 1
+  call void @llvm.set.loop.iterations.i32(i32 %2)
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %tmp6, %vector.body ]
+  %3 = phi i32 [ %2, %vector.ph ], [ %4, %vector.body ]
+  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
+
+  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
+  %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
+
+  %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
+  %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
+  %tmp6 = add <8 x i16> %tmp5, %wide.masked.load
+  %index.next = add nuw nsw i32 %index, 8
+  %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1)
+  %5 = icmp ne i32 %4, 0
+  br i1 %5, label %vector.body, label %middle.block
+
+middle.block: ; preds = %vector.body
+  %tmp8 = select <8 x i1> %tmp3, <8 x i16> %tmp6, <8 x i16> %vec.phi
+  %rdx.shuf = shufflevector <8 x i16> %tmp8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i16> %rdx.shuf, %tmp8
+  %rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx
+  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6
+  %tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0
+  ret i16 %tmp9
+
+for.cond.cleanup:
+  %res.0 = phi i16 [ 0, %entry ]
+  ret i16 %res.0
+}
+
+; The vector loop is not guarded with an entry check (N == 0). This means we
+; can't calculate a precise range for the backedge-taken count in
+; @llvm.get.active.lane.mask, so we must assume overflow can happen, and thus
+; we can't insert the VCTP here.
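+;
+; For reference, the guarded functions above establish that range with an
+; entry block of this shape (a sketch of the recurring pattern, not extra
+; test input):
+;
+;   entry:
+;     %cmp8 = icmp eq i32 %N, 0
+;     br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+;
+; which guarantees %N >= 1, and hence a non-wrapping backedge-taken count
+; %N - 1, whenever the vector loop is reached.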
+;
+; CHECK-LABEL: @reduction_not_guarded
+;
+; CHECK-NOT: @llvm.arm.mve.vctp
+; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
+;
+; CHECK: entry:
+; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1
+; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0
+; CHECK: %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
+;
+; CHECK: vector.body:
+; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
+; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2
+; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef)
+; CHECK: ret
+;
+define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
+entry:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
@@ -81,15 +174,18 @@
   call void @llvm.set.loop.iterations.i32(i32 %2)
   br label %vector.body

 vector.body: ; preds = %vector.body, %entry
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-  %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
+
+  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
+  %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
+
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@@ -111,8 +207,119 @@
   ret i16 %tmp9
 }
+
+; Without forcing tail-predication, we bail because overflow analysis says:
+;
+;   overflow possible in: {(-1 + (sext i16 %Size to i32)),+,-1}<%for.body>
+;
+; i.e. the inner loop's backedge-taken count starts at %Size - 1 and steps
+; down by 1 on each iteration of %for.body, and SCEV cannot prove here that
+; it never wraps.
+;
+; CHECK-LABEL: @Correlation
+;
+; CHECK: entry:
+; CHECK: for.body.lr.ph: ; preds = %entry
+; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph
+; CHECK: vector.ph: ; preds = %for.body
+; CHECK: %trip.count.minus.1 = add i32 %8, -1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7)
+; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NOT: @llvm.arm.mve.vctp
+; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc
+; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}}
+;
+;
+; FORCE-LABEL: @Correlation
+; FORCE: vector.ph: ; preds = %for.body
+; FORCE: 
%trip.count.minus.1 = add i32 %{{.*}}, -1 +; FORCE: call void @llvm.set.loop.iterations.i32(i32 %{{.*}}) +; FORCE: br label %vector.body +; FORCE: vector.body: ; preds = %vector.body, %vector.ph +; FORCE: %[[VCTP:.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 %{{.*}}) +; FORCE: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[VCTP]]{{.*}} +; +define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +entry: + %conv = sext i16 %N to i32 + %cmp36 = icmp sgt i16 %N, 0 + br i1 %cmp36, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: + %conv2 = sext i16 %Size to i32 + %conv1032 = zext i16 %Scale to i32 + %0 = add i32 %conv2, 3 + br label %for.body + +for.body: + %lsr.iv51 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ] + %lsr.iv46 = phi i16* [ %scevgep47, %for.end ], [ %Input, %for.body.lr.ph ] + %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ] + %1 = mul nsw i32 %i.037, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv2, %i.037 + %cmp433 = icmp slt i32 %i.037, %conv2 + br i1 %cmp433, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv48 = phi i16* [ %scevgep49, %vector.body ], [ %lsr.iv46, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %16, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ] + %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>* + %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load42 to <4 x i32> + %12 = mul nsw <4 x i32> %11, %10 + %13 = insertelement <4 x i32> undef, i32 %conv1032, i32 0 + %14 = shufflevector <4 x i32> %13, <4 x i32> undef, <4 x i32> zeroinitializer + %15 = ashr <4 x i32> %12, %14 + %16 = add <4 x i32> %15, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 4 + %17 = call i32 @llvm.loop.decrement.reg.i32(i32 %9, i32 1) + %18 = icmp ne i32 %17, 0 + br i1 %18, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %19 = select <4 x i1> %active.lane.mask, <4 x i32> %16, <4 x i32> %vec.phi + %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %20, %middle.block ] + %21 = lshr i32 %Sum.0.lcssa, 16 + %conv13 = trunc i32 %21 to i16 + %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037 + store i16 %conv13, i16* %arrayidx14, align 2 + %inc16 = add nuw nsw i32 %i.037, 1 + 
%scevgep47 = getelementptr i16, i16* %lsr.iv46, i32 1 + %lsr.iv.next = add i32 %lsr.iv51, -1 + %exitcond39 = icmp eq i32 %inc16, %conv + br i1 %exitcond39, label %for.end17, label %for.body + +for.end17: ; preds = %for.end, %entry + ret void +} + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -15,8 +15,7 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -25,12 +24,12 @@ ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -53,7 +52,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -89,8 +91,7 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: lsrs r1, r1, #2 -; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -98,11 +99,11 @@ ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -125,7 +126,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + + ; %1 = icmp ule <4 x i32> %induction, 
%broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi @@ -157,8 +161,7 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: lsrs r1, r1, #2 -; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -166,11 +169,11 @@ ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -193,7 +196,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi @@ -218,9 +224,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 @@ -247,7 +255,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11 @@ -269,9 +280,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 @@ -298,7 +311,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast 
i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11 @@ -320,9 +336,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 @@ -348,7 +366,10 @@ %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer %induction = add <16 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + + ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef) %3 = getelementptr inbounds i8, i8* %c, i32 %index @@ -374,9 +395,11 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: vmul.i16 q0, q1, q0 @@ -402,7 +425,10 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef) %3 = getelementptr inbounds i16, i16* %c, i32 %index @@ -427,4 +453,6 @@ declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -5,12 +5,9 @@ ; CHECK: vector.ph: ; CHECK: call void @llvm.set.loop.iterations.i32 -; CHECK: [[UF:%[^ ]+]] = shl i32 %{{.*}}, 2 -; CHECK: [[REMAT_ITER:%[^ ]+]] = sub i32 %N, [[UF]] ; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: phi i32 [ 0, %vector.ph ] ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 @@ -18,7 +15,7 @@ ; CHECK: call <4 x 
i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], ; CHECK: middle.block: -; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER]]) +; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]]) ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) @@ -32,14 +29,14 @@ %4 = lshr i32 %3, 2 %5 = add nuw nsw i32 %4, 1 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph - + vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer call void @llvm.set.loop.iterations.i32(i32 %5) br label %vector.body - + vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] @@ -51,7 +48,10 @@ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 + + ; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef) %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef) %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load @@ -62,20 +62,23 @@ %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %11 = icmp ne i32 %10, 0 br i1 %11, label %vector.body, label %middle.block - + middle.block: ; preds = %vector.body - %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 +; TODO: check that the intrinsic is also emitted here by the loop vectoriser +; %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13) br label %for.cond.cleanup - + for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ] ret i32 %res.0.lcssa } - + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -4,22 +4,24 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmas1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it 
lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -40,7 +42,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -61,22 +66,24 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmas2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -97,7 +104,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -119,22 +129,24 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fma1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner 
Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -155,7 +167,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -176,22 +191,24 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fma2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -212,7 +229,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 @@ -234,23 +254,25 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: eor r12, r12, #-2147483648 +; CHECK-NEXT: eor r12, r4, #-2147483648 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; 
CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -272,7 +294,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -293,17 +318,19 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmss2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vdup.32 q0, r12 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -311,7 +338,7 @@ ; CHECK-NEXT: vstrw.32 q3, [r2], #16 ; CHECK-NEXT: letp lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -332,7 +359,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -361,9 +391,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 @@ -392,7 +424,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> 
%induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -421,9 +456,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -452,7 +489,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -474,23 +514,25 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: eor r12, r12, #-2147483648 +; CHECK-NEXT: eor r12, r4, #-2147483648 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -512,7 +554,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -540,9 +585,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; 
CHECK-NEXT: vfms.f32 q2, q1, q0 @@ -570,7 +617,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -592,23 +642,25 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vfma.f32 q0, q1, r12 -; CHECK-NEXT: vstrw.32 q0, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -629,7 +681,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -651,23 +706,25 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fms4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vfma.f32 q0, q1, r12 -; CHECK-NEXT: vstrw.32 q0, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; 
CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -688,7 +745,10 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 @@ -710,3 +770,4 @@ declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
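
For reference, the two lowerings these tests exercise can be sketched in IR.
The names %btc, %elems and %splat.btc below are placeholders for the
backedge-taken count and element count values of the surrounding loop, not
values taken from any one test. The vectoriser emits the lane mask as:

  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)

When the overflow checks pass, the pass rewrites it as a VCTP on a
decrementing element count (the [[PHI]]/[[ELEMS]] patterns checked in
tail-reduce.ll):

  %elems = phi i32 [ %N, %vector.ph ], [ %elems.rem, %vector.body ]
  %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
  %elems.rem = sub i32 %elems, 4

Otherwise RevertActiveLaneMask lowers it back to the equivalent icmp (the
%lane.mask.* patterns checked above):

  %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
  %mask = icmp ule <4 x i32> %lane.mask.induction, %splat.btc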