diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 284d278c5cab..e6ec52dc54dc 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -1,715 +1,706 @@
 //===- MVETailPredication.cpp - MVE Tail Predication ------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
 /// branches to help accelerate DSP applications. These two extensions,
 /// combined with a new form of predication called tail-predication, can be
 /// used to provide implicit vector predication within a low-overhead loop.
 /// This is implicit because the predicate of active/inactive lanes is
 /// calculated by hardware, and thus does not need to be explicitly passed
 /// to vector instructions. The instructions responsible for this are the
 /// DLSTP and WLSTP instructions, which set up a tail-predicated loop and
 /// the total number of data elements processed by the loop. The loop-end
 /// LETP instruction is responsible for decrementing and setting the remaining
 /// elements to be processed and generating the mask of active lanes.
 ///
 /// The HardwareLoops pass inserts intrinsics identifying loops that the
 /// backend will attempt to convert into a low-overhead loop. The vectorizer is
 /// responsible for generating a vectorized loop in which the lanes are
 /// predicated upon the iteration counter. This pass looks at these predicated
 /// vector loops that are targets for low-overhead loops, and prepares them
 /// for code generation. Once the vectorizer has produced a masked loop, there
 /// are a couple of final forms:
 /// - A tail-predicated loop, with implicit predication.
 /// - A loop containing multiple VCTP instructions, predicating multiple VPT
 ///   blocks of instructions operating on different vector types.
 ///
 /// This pass:
-/// 1) Pattern matches the scalar iteration count produced by the vectoriser.
-///    The scalar loop iteration count represents the number of elements to be
-///    processed.
-///    TODO: this could be emitted using an intrinsic, similar to the hardware
-///    loop intrinsics, so that we don't need to pattern match this here.
-/// 2) Inserts the VCTP intrinsic to represent the effect of
-///    tail predication. This will be picked up by the ARM Low-overhead loop
-///    pass, which performs the final transformation to a DLSTP or WLSTP
-///    tail-predicated loop.
+/// 1) Checks if the predicates of the masked load/store instructions are
+///    generated by intrinsic @llvm.get.active.lane.mask(). This intrinsic
+///    consumes the Backedge Taken Count (BTC) of the scalar loop as its
+///    second argument, which we extract to set up the number of elements
+///    processed by the loop.
+/// 2) Intrinsic @llvm.get.active.lane.mask() is then replaced by the MVE
+///    target specific VCTP intrinsic to represent the effect of tail
+///    predication. This will be picked up by the ARM Low-overhead loop pass,
+///    which performs the final transformation to a DLSTP or WLSTP
+///    tail-predicated loop.
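+///
+/// As an illustrative sketch (value names %index, %btc and %elems invented,
+/// not lifted from a specific test), for a 4-lane vector loop the predicate
+///
+///   %pred = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
+///
+/// is rewritten in step 2) into a phi counting the remaining elements plus a
+/// VCTP, with %num.elements = %btc + 1 materialised in the preheader:
+///
+///   %elems = phi i32 [ %num.elements, %vector.ph ], [ %remaining, %vector.body ]
+///   %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
+///   %remaining = sub i32 %elems, 4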
#include "ARM.h" #include "ARMSubtarget.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; #define DEBUG_TYPE "mve-tail-predication" #define DESC "Transform predicated vector loops to use MVE tail predication" +static cl::opt +ForceTailPredication("force-tail-predication", cl::Hidden, cl::init(false), + cl::desc("Force tail-predication even if it might be " + "unsafe (e.g. possible overflow in loop " + "counters)")); + cl::opt DisableTailPredication("disable-mve-tail-predication", cl::Hidden, cl::init(true), cl::desc("Disable MVE Tail Predication")); namespace { -// Bookkeeping for pattern matching the loop trip count and the number of -// elements processed by the loop. -struct TripCountPattern { - // An icmp instruction that calculates a predicate of active/inactive lanes - // used by the masked loads/stores. - Instruction *Predicate = nullptr; - - // The add instruction that increments the IV. - Value *TripCount = nullptr; - - // The number of elements processed by the vector loop. - Value *NumElements = nullptr; - - // Other instructions in the icmp chain that calculate the predicate. - FixedVectorType *VecTy = nullptr; - Instruction *Shuffle = nullptr; - Instruction *Induction = nullptr; - - TripCountPattern(Instruction *P, Value *TC, FixedVectorType *VT) - : Predicate(P), TripCount(TC), VecTy(VT){}; -}; - class MVETailPredication : public LoopPass { SmallVector MaskedInsts; Loop *L = nullptr; LoopInfo *LI = nullptr; const DataLayout *DL; DominatorTree *DT = nullptr; ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; TargetLibraryInfo *TLI = nullptr; bool ClonedVCTPInExitBlock = false; + IntrinsicInst *ActiveLaneMask = nullptr; + FixedVectorType *VecTy = nullptr; public: static char ID; MVETailPredication() : LoopPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.setPreservesCFG(); } bool runOnLoop(Loop *L, LPPassManager&) override; private: /// Perform the relevant checks on the loop and convert if possible. bool TryConvert(Value *TripCount); /// Return whether this is a vectorized loop, that contains masked /// load/stores. bool IsPredicatedVectorLoop(); - /// Compute a value for the total number of elements that the predicated - /// loop will process if it is a runtime value. - bool ComputeRuntimeElements(TripCountPattern &TCP); - - /// Return whether this is the icmp that generates an i1 vector, based - /// upon a loop counter and a limit that is defined outside the loop, - /// that generates the active/inactive lanes required for tail-predication. 
-  bool isTailPredicate(TripCountPattern &TCP);
+  /// Perform checks on the arguments of the @llvm.get.active.lane.mask
+  /// intrinsic: check if the first is a loop induction variable, and for
+  /// the second check that no overflow can occur in the expressions that
+  /// use this backedge-taken count.
+  bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);

   /// Insert the intrinsic to represent the effect of tail predication.
-  void InsertVCTPIntrinsic(TripCountPattern &TCP,
+  void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+                           FixedVectorType *VecTy,
                            DenseMap<Instruction *, Instruction *> &NewPredicates);

   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
+
+  /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it
+  /// needs to be lowered to an icmp.
+  void RevertActiveLaneMask();
 };

 } // end namespace

 static bool IsDecrement(Instruction &I) {
   auto *Call = dyn_cast<IntrinsicInst>(&I);
   if (!Call)
     return false;

   Intrinsic::ID ID = Call->getIntrinsicID();
   return ID == Intrinsic::loop_decrement_reg;
 }

 static bool IsMasked(Instruction *I) {
   auto *Call = dyn_cast<IntrinsicInst>(I);
   if (!Call)
     return false;

   Intrinsic::ID ID = Call->getIntrinsicID();
   // TODO: Support gather/scatter expand/compress operations.
   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
 }

 void MVETailPredication::RematerializeIterCount() {
   SmallVector<WeakTrackingVH, 16> DeadInsts;
   SCEVExpander Rewriter(*SE, *DL, "mvetp");
   ReplaceExitVal ReplaceExitValue = AlwaysRepl;

   formLCSSARecursively(*L, *DT, LI, SE);
   rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
                         DeadInsts);
 }

+void MVETailPredication::RevertActiveLaneMask() {
+  if (!ActiveLaneMask)
+    return;
+
+  int VectorWidth = VecTy->getElementCount().Min;
+  IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());
+
+  // 1. Create the vector induction step. This %induction will be the LHS of
+  // the icmp:
+  //
+  // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  // %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
+  //
+  Value *Index = ActiveLaneMask->getOperand(0);
+  Value *SplatIndex =
+      Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");
+
+  SmallVector<Constant *, 8> Indices;
+  for (int i = 0; i < VectorWidth; ++i)
+    Indices.push_back(ConstantInt::get(Index->getType(), i));
+
+  Constant *CV = ConstantVector::get(Indices);
+  Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");
+
+  LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
+             dbgs() << "ARM TP: New Induction: " << *Induction << "\n");
+
+  // 2. In the Preheader, first look if the splat BTC already exists. Find
+  // this %splat, which will be the RHS of the icmp:
+  //
+  // %TC.minus.1 = add i32 %N, -1
+  // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
+  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  //
+  auto *Preheader = L->getLoopPreheader();
+  auto *BTC = ActiveLaneMask->getOperand(1);
+  Value *SplatBTC = nullptr;
+
+  if (auto *C = dyn_cast<ConstantInt>(BTC)) {
+    Builder.SetInsertPoint(Preheader->getTerminator());
+    SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
+    LLVM_DEBUG(dbgs() << "ARM TP: New splat BTC: " << *SplatBTC << "\n");
+  } else {
+    Instruction *InsertElem;
+    for (auto &V : *Preheader) {
+      InsertElem = dyn_cast<InsertElementInst>(&V);
+      if (!InsertElem)
+        continue;
+      ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
+      if (!CI)
+        continue;
+      if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
+        continue;
+      if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
+        break;
+    }
+  }
+  // Or create the splat BTC if it doesn't exist.
+  if (!SplatBTC) {
+    Builder.SetInsertPoint(Preheader->getTerminator());
+    Value *Undef =
+        UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
+    Value *Insert = Builder.CreateInsertElement(Undef,
+        BTC, Builder.getInt32(0), "insert.btc");
+    Value *Zero = ConstantInt::get(Insert->getType(), 0);
+    SplatBTC = Builder.CreateShuffleVector(Insert, Undef, Zero, "splat.btc");
+    LLVM_DEBUG(dbgs() << "ARM TP: New splat BTC: " << *SplatBTC << "\n");
+  }
+
+  Builder.SetInsertPoint(ActiveLaneMask);
+  Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
+  LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
+  ActiveLaneMask->replaceAllUsesWith(ICmp);
+  ActiveLaneMask->eraseFromParent();
+}
+
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;

   MaskedInsts.clear();
   Function &F = *L->getHeader()->getParent();
   auto &TPC = getAnalysis<TargetPassConfig>();
   auto &TM = TPC.getTM<TargetMachine>();
   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
   DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
+  ActiveLaneMask = nullptr;

   // The MVE and LOB extensions are combined to enable tail-predication, but
   // there's nothing preventing us from generating VCTP instructions for v8.1m.
   if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
     LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n");
     return false;
   }

   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader)
     return false;

   auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* {
     for (auto &I : *BB) {
       auto *Call = dyn_cast<IntrinsicInst>(&I);
       if (!Call)
         continue;

       Intrinsic::ID ID = Call->getIntrinsicID();
       if (ID == Intrinsic::set_loop_iterations ||
           ID == Intrinsic::test_set_loop_iterations)
         return cast<IntrinsicInst>(&I);
     }
     return nullptr;
   };

   // Look for the hardware loop intrinsic that sets the iteration count.
   IntrinsicInst *Setup = FindLoopIterations(Preheader);

   // The test.set iteration count could live in the pre-preheader.
   if (!Setup) {
     if (!Preheader->getSinglePredecessor())
       return false;
     Setup = FindLoopIterations(Preheader->getSinglePredecessor());
     if (!Setup)
       return false;
   }

   // Search for the hardware loop intrinsic that decrements the loop counter.
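+  // In the tests below this decrement appears as, for example (value names
+  // illustrative):
+  //
+  //   %rem = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+  //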
   IntrinsicInst *Decrement = nullptr;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
       if (IsDecrement(I)) {
         Decrement = cast<IntrinsicInst>(&I);
         break;
       }
     }
   }

   if (!Decrement)
     return false;

   ClonedVCTPInExitBlock = false;
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
                     << *Decrement << "\n");

   if (TryConvert(Setup->getArgOperand(0))) {
     if (ClonedVCTPInExitBlock)
       RematerializeIterCount();
     return true;
-  }
+  } else
+    RevertActiveLaneMask();

   LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
   return false;
 }

-// Pattern match predicates/masks and determine if they use the loop induction
-// variable to control the number of elements processed by the loop. If so,
-// the loop is a candidate for tail-predication.
-bool MVETailPredication::isTailPredicate(TripCountPattern &TCP) {
-  using namespace PatternMatch;
-
-  // Pattern match the loop body and find the add with takes the index iv
-  // and adds a constant vector to it:
-  //
-  // vector.body:
-  // ..
-  // %index = phi i32
-  // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
-  //                                  <4 x i32> undef,
-  //                                  <4 x i32> zeroinitializer
-  // %induction = [add|or] <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-  // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
-  //
-  // Please note that the 'or' is equivalent to the 'and' here, this relies on
-  // BroadcastSplat being the IV which we know is a phi with 0 start and Lanes
-  // increment, which is all being checked below.
-  Instruction *BroadcastSplat = nullptr;
-  Constant *Const = nullptr;
-  if (!match(TCP.Induction,
-             m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))) &&
-      !match(TCP.Induction,
-             m_Or(m_Instruction(BroadcastSplat), m_Constant(Const))))
-    return false;
-
-  // Check that we're adding <0, 1, 2, 3...
-  if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
-    for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
-      if (CDS->getElementAsInteger(i) != i)
-        return false;
-    }
-  } else
-    return false;
-
-  Instruction *Insert = nullptr;
-  // The shuffle which broadcasts the index iv into a vector.
-  if (!match(BroadcastSplat,
-             m_Shuffle(m_Instruction(Insert), m_Undef(), m_ZeroMask())))
-    return false;
-
-  // The insert element which initialises a vector with the index iv.
-  Instruction *IV = nullptr;
-  if (!match(Insert, m_InsertElt(m_Undef(), m_Instruction(IV), m_Zero())))
-    return false;
-
-  // The index iv.
-  auto *Phi = dyn_cast<PHINode>(IV);
-  if (!Phi)
-    return false;
-
-  // TODO: Don't think we need to check the entry value.
-  Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
-  if (!match(OnEntry, m_Zero()))
-    return false;
-
-  Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
-  unsigned Lanes = cast<FixedVectorType>(Insert->getType())->getNumElements();
-
-  Instruction *LHS = nullptr;
-  if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
-    return false;
-
-  return LHS == Phi;
-}
-
 static FixedVectorType *getVectorType(IntrinsicInst *I) {
   unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
   auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
-  return cast<FixedVectorType>(PtrTy->getElementType());
+  auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType());
+  assert(VecTy && "No scalable vectors expected here");
+  return VecTy;
 }

 bool MVETailPredication::IsPredicatedVectorLoop() {
   // Check that the loop contains at least one masked load/store intrinsic.
   // We only support 'normal' vector instructions - other than masked
   // load/stores.
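+  // For illustration (value names invented), a qualifying masked load as
+  // found in the tests below looks like:
+  //
+  //   %wml = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr,
+  //              i32 4, <8 x i1> %mask, <8 x i16> undef)
+  //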
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
       if (IsMasked(&I)) {
         FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
         unsigned Lanes = VecTy->getNumElements();
         unsigned ElementWidth = VecTy->getScalarSizeInBits();
         // MVE vectors are 128-bit, but don't support 128 x i1.
         // TODO: Can we support vectors larger than 128-bits?
         unsigned MaxWidth = TTI->getRegisterBitWidth(true);
         if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
           return false;
         MaskedInsts.push_back(cast<IntrinsicInst>(&I));
       } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
         if (Int->getIntrinsicID() == Intrinsic::fma)
           continue;
         for (auto &U : Int->args()) {
           if (isa<VectorType>(U->getType()))
             return false;
         }
       }
     }
   }

   return !MaskedInsts.empty();
 }

-// Pattern match the predicate, which is an icmp with a constant vector of
-// this form:
-//
-//   icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
-//
-// and return the constant, i.e. 32002 in this example. This is assumed to be
-// the scalar loop iteration count: the number of loop elements by the
-// the vector loop. Further checks are performed in function isTailPredicate(),
-// to verify 'induction' behaves as an induction variable.
-//
-static bool ComputeConstElements(TripCountPattern &TCP) {
-  if (!dyn_cast<ConstantInt>(TCP.TripCount))
-    return false;
-
-  ConstantInt *VF = ConstantInt::get(
-      cast<IntegerType>(TCP.TripCount->getType()), TCP.VecTy->getNumElements());
-  using namespace PatternMatch;
-  CmpInst::Predicate CC;
-
-  if (!match(TCP.Predicate, m_ICmp(CC, m_Instruction(TCP.Induction),
-                                   m_AnyIntegralConstant())) ||
-      CC != ICmpInst::ICMP_ULT)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "ARM TP: icmp with constants: "; TCP.Predicate->dump(););
-  Value *ConstVec = TCP.Predicate->getOperand(1);
-
-  auto *CDS = dyn_cast<ConstantDataSequential>(ConstVec);
-  if (!CDS || CDS->getNumElements() != VF->getSExtValue())
-    return false;
-
-  if ((TCP.NumElements = CDS->getSplatValue())) {
-    assert(dyn_cast<ConstantInt>(TCP.NumElements)->getSExtValue() %
-               VF->getSExtValue() !=
-               0 &&
-           "tail-predication: trip count should not be a multiple of the VF");
-    LLVM_DEBUG(dbgs() << "ARM TP: Found const elem count: " << *TCP.NumElements
-                      << "\n");
-    return true;
-  }
-  return false;
-}
-
-// Pattern match the loop iteration count setup:
-//
-// %trip.count.minus.1 = add i32 %N, -1
-// %broadcast.splatinsert10 = insertelement <4 x i32> undef,
-//                                          i32 %trip.count.minus.1, i32 0
-// %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
-//                                    <4 x i32> undef,
-//                                    <4 x i32> zeroinitializer
-// ..
-// vector.body:
-// ..
-//
-static bool MatchElemCountLoopSetup(Loop *L, Instruction *Shuffle,
-                                    Value *NumElements) {
-  using namespace PatternMatch;
-  Instruction *Insert = nullptr;
-
-  if (!match(Shuffle,
-             m_Shuffle(m_Instruction(Insert), m_Undef(), m_ZeroMask())))
-    return false;
-
-  // Insert the limit into a vector.
-  Instruction *BECount = nullptr;
-  if (!match(Insert,
-             m_InsertElt(m_Undef(), m_Instruction(BECount), m_Zero())))
-    return false;
-
-  // The limit calculation, backedge count.
-  Value *TripCount = nullptr;
-  if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
-    return false;
-
-  if (TripCount != NumElements || !L->isLoopInvariant(BECount))
-    return false;
-
-  return true;
-}
-
-bool MVETailPredication::ComputeRuntimeElements(TripCountPattern &TCP) {
-  using namespace PatternMatch;
-  const SCEV *TripCountSE = SE->getSCEV(TCP.TripCount);
-  ConstantInt *VF = ConstantInt::get(
-      cast<IntegerType>(TCP.TripCount->getType()), TCP.VecTy->getNumElements());
-
-  if (VF->equalsInt(1))
-    return false;
-
-  CmpInst::Predicate Pred;
-  if (!match(TCP.Predicate, m_ICmp(Pred, m_Instruction(TCP.Induction),
-                                   m_Instruction(TCP.Shuffle))) ||
-      Pred != ICmpInst::ICMP_ULE)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "Computing number of elements for vector trip count: ";
-             TCP.TripCount->dump());
-
-  // Otherwise, continue and try to pattern match the vector iteration
-  // count expression
-  auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr * {
-    if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
-      if (Const->getAPInt() != -VF->getValue())
-        return nullptr;
-    } else
-      return nullptr;
-    return dyn_cast<SCEVMulExpr>(S->getOperand(1));
-  };
-
-  auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr * {
-    if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
-      if (Const->getValue() != VF)
-        return nullptr;
-    } else
-      return nullptr;
-    return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
-  };
-
-  auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV * {
-    if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
-      if (Const->getValue() != VF)
-        return nullptr;
-    } else
-      return nullptr;
-
-    if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
-      if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
-        if (Const->getAPInt() != (VF->getValue() - 1))
-          return nullptr;
-      } else
-        return nullptr;
-
-      return RoundUp->getOperand(1);
-    }
-    return nullptr;
-  };
-
-  // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
-  // determine the numbers of elements instead? Looks like this is what is used
-  // for delinearization, but I'm not sure if it can be applied to the
-  // vectorized form - at least not without a bit more work than I feel
-  // comfortable with.
-
-  // Search for Elems in the following SCEV:
-  // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF))
-  const SCEV *Elems = nullptr;
-  if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
-    if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
-      if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
-        if (auto *Mul = VisitAdd(Add))
-          if (auto *Div = VisitMul(Mul))
-            if (auto *Res = VisitDiv(Div))
-              Elems = Res;
-
-  if (!Elems)
-    return false;
-
-  Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
-  if (!isSafeToExpandAt(Elems, InsertPt, *SE))
-    return false;
-
-  auto DL = L->getHeader()->getModule()->getDataLayout();
-  SCEVExpander Expander(*SE, DL, "elements");
-  TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
-
-  if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
-    return false;
-
-  return true;
-}
-
 // Look through the exit block to see whether there's a duplicate predicate
 // instruction. This can happen when we need to perform a select on values
 // from the last and previous iteration. Instead of doing a straight
 // replacement of that predicate with the vctp, clone the vctp and place it
 // in the block. This means that the VPR doesn't have to be live into the
 // exit block which should make it easier to convert this loop into a proper
 // tail predicated loop.
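+// For example (with illustrative value names), an exit block containing:
+//
+//   %dup = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
+//   %res = select <4 x i1> %dup, <4 x i32> %vec.last, <4 x i32> %vec.prev
+//
+// has %dup replaced by a clone of the VCTP that was created inside the loop,
+// rather than by the original VCTP itself.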
 static bool Cleanup(DenseMap<Instruction *, Instruction *> &NewPredicates,
                     SetVector<Instruction *> &MaybeDead, Loop *L) {
   BasicBlock *Exit = L->getUniqueExitBlock();
   if (!Exit) {
     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
     return false;
   }

   bool ClonedVCTPInExitBlock = false;

   for (auto &Pair : NewPredicates) {
     Instruction *OldPred = Pair.first;
     Instruction *NewPred = Pair.second;

     for (auto &I : *Exit) {
       if (I.isSameOperationAs(OldPred)) {
         Instruction *PredClone = NewPred->clone();
         PredClone->insertBefore(&I);
         I.replaceAllUsesWith(PredClone);
         MaybeDead.insert(&I);
         ClonedVCTPInExitBlock = true;
         LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
                    dbgs() << "ARM TP: with: "; PredClone->dump());
         break;
       }
     }
   }

   // Drop references and add operands to check for dead.
   SmallPtrSet<Instruction *, 4> Dead;
   while (!MaybeDead.empty()) {
     auto *I = MaybeDead.front();
     MaybeDead.remove(I);
     if (I->hasNUsesOrMore(1))
       continue;

     for (auto &U : I->operands())
       if (auto *OpI = dyn_cast<Instruction>(U))
         MaybeDead.insert(OpI);

-    I->dropAllReferences();
     Dead.insert(I);
   }

   for (auto *I : Dead) {
     LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
     I->eraseFromParent();
   }

   for (auto I : L->blocks())
     DeleteDeadPHIs(I);

   return ClonedVCTPInExitBlock;
 }

-void MVETailPredication::InsertVCTPIntrinsic(TripCountPattern &TCP,
+// The active lane intrinsic has this form:
+//
+//   @llvm.get.active.lane.mask(IV, BTC)
+//
+// Here we perform checks that this intrinsic behaves as expected, which
+// means:
+//
+// 1) The element count, which is calculated with BTC + 1, cannot overflow.
+// 2) The element count needs to be sufficiently large that the decrement of
+//    the element counter doesn't overflow, which means that we need to
+//    prove:
+//        ceil(ElementCount / VectorWidth) >= TripCount
+//    by rounding ElementCount up:
+//        (ElementCount + (VectorWidth - 1)) / VectorWidth
+//    and evaluating whether the expression
+//        ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+//    isKnownNonNegative.
+// 3) The IV must be an induction phi with an increment equal to the
+//    vector width.
+bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
+                                          FixedVectorType *VecTy) {
+  // 1) Test whether entry to the loop is protected by a conditional
+  // BTC + 1 < 0. In other words, if the scalar trip count overflows and
+  // becomes negative, we shouldn't enter the loop, and creating the
+  // tripcount expression BTC + 1 is not safe. So, check that BTC
+  // isn't max. This is evaluated in unsigned, because the semantics
+  // of @get.active.lane.mask is a ULE comparison.
+  int VectorWidth = VecTy->getNumElements();
+  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
+  auto *BTC = SE->getSCEV(BackedgeTakenCount);
+
+  if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+      !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
+               BTC->dump());
+    return false;
+  }
+
+  // 2) Prove that the sub expression is non-negative, i.e. it doesn't
+  // overflow:
+  //
+  //   ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+  //
+  // 2.1) First prove overflow can't happen in:
+  //
+  //   ElementCount + (VectorWidth - 1)
+  //
+  // Because of a lack of context, it is difficult to get useful bounds on
+  // this expression. But since ElementCount uses the same variables as the
+  // TripCount (TC), for which we can find meaningful value ranges, we use
+  // that instead and assert that:
+  //
+  //   upperbound(TC) <= UINT_MAX - VectorWidth
+  //
+  auto *TC = SE->getSCEV(TripCount);
+  unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+  auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
+  uint64_t MaxMinusVW = Diff.getZExtValue();
+  uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+
+  if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
+               dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
+               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
+    return false;
+  }
+
+  // 2.2) Make sure overflow doesn't happen in the final expression:
+  //
+  //   ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+  //
+  // To do this, compare the full ranges of these subexpressions:
+  //
+  //   Range(Ceil) <= Range(TC)
+  //
+  // where Ceil = (ElementCount + (VW-1)) / VW. If Ceil and TC are runtime
+  // values (and not constants), we have to compensate for the lowerbound
+  // value range to be off by 1. The reason is that BTC lives in the
+  // preheader in this form:
+  //
+  //   %trip.count.minus = add nsw nuw i32 %N, -1
+  //
+  // For the loop to be executed, %N has to be >= 1 and as a result the value
+  // range of %trip.count.minus has a lower bound of 0. Value %TC has this
+  // form:
+  //
+  //   %5 = add nuw nsw i32 %4, 1
+  //   call void @llvm.set.loop.iterations.i32(i32 %5)
+  //
+  // where %5 is some expression using %N, which needs to have a lower bound
+  // of 1. Thus, if the ranges of Ceil and TC are not a single constant but a
+  // set, we first add 0 to TC such that we can do the <= comparison on both
+  // sets.
+  //
+  auto *One = SE->getOne(TripCount->getType());
+  // ElementCount = BTC + 1
+  auto *ElementCount = SE->getAddExpr(BTC, One);
+  // Tmp = ElementCount + (VW-1)
+  auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+  // Ceil = (ElementCount + (VW-1)) / VW
+  auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
+
+  ConstantRange RangeCeil = SE->getSignedRange(Ceil);
+  ConstantRange RangeTC = SE->getSignedRange(TC);
+  if (!RangeTC.isSingleElement()) {
+    auto ZeroRange =
+        ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
+    RangeTC = RangeTC.unionWith(ZeroRange);
+  }
+  if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
+    return false;
+  }
+
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
+  // helpers here to get the induction variable, because the hardware loop is
+  // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+  // different counter. Using SCEV, we check that the induction is of the
+  // form i = i + 4, where the increment must be equal to the VectorWidth.
+  auto *IV = ActiveLaneMask->getOperand(0);
+  auto *IVExpr = SE->getSCEV(IV);
+  auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+  if (!AddExpr) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+    return false;
+  }
+  // Check that this AddRec is associated with this loop.
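+  // For example, for a 4-lane vector loop counting up from 0 the expected
+  // AddRec is {0,+,4}<%vector.body>: start value 0, a step of VectorWidth,
+  // and attached to this loop's vector body rather than an outer loop.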
+  if (AddExpr->getLoop() != L) {
+    LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+    return false;
+  }
+  auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+  if (!Step) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+               AddExpr->getOperand(1)->dump());
+    return false;
+  }
+  auto StepValue = Step->getValue()->getSExtValue();
+  if (VectorWidth == StepValue)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
+                       "vector width : " << VectorWidth << "\n");
+
+  return false;
+}
+
+// Materialize NumElements in the preheader block.
+static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
+  // First, check if %BTC already exists in the preheader in this form:
+  //
+  // preheader:
+  //   %BTC = add i32 %N, -1
+  //   ..
+  // vector.body:
+  //
+  // If it does, we don't need to emit %NumElems = %BTC + 1, but can just
+  // return %N instead.
+  for (auto &I : *Preheader) {
+    if (I.getOpcode() != Instruction::Add || &I != BTC)
+      continue;
+    ConstantInt *MinusOne = nullptr;
+    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+      continue;
+    if (MinusOne->getSExtValue() == -1) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+      return I.getOperand(0);
+    }
+  }
+
+  // But we do need to materialise the element count if BTC is not in that
+  // form, e.g. if it is a constant.
+  IRBuilder<> Builder(Preheader->getTerminator());
+  Value *NumElements = Builder.CreateAdd(BTC,
+      ConstantInt::get(BTC->getType(), 1), "num.elements");
+  LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
+  return NumElements;
+}
+
+void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
+    Value *TripCount, FixedVectorType *VecTy,
     DenseMap<Instruction *, Instruction *> &NewPredicates) {
-  IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);

+  // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
+  // is one less than the trip count. So we need to find or create
+  // %num.elements = %BTC + 1 in the preheader.
+  Value *BTC = ActiveLaneMask->getOperand(1);
+  Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
+
+  // Insert a phi to count the number of elements processed by the loop.
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
-  Processed->addIncoming(TCP.NumElements, L->getLoopPreheader());
+  Processed->addIncoming(NumElements, L->getLoopPreheader());

-  // Insert the intrinsic to represent the effect of tail predication.
-  Builder.SetInsertPoint(cast<Instruction>(TCP.Predicate));
+  // Replace @llvm.get.active.lane.mask() with the ARM specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
+  Builder.SetInsertPoint(ActiveLaneMask);
   ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), TCP.VecTy->getNumElements());
+      ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());

   Intrinsic::ID VCTPID;
-  switch (TCP.VecTy->getNumElements()) {
+  switch (VecTy->getNumElements()) {
   default:
     llvm_unreachable("unexpected number of lanes");
   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
   case 8:  VCTPID = Intrinsic::arm_mve_vctp16; break;
   case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;

     // FIXME: vctp64 currently not supported because the predicate
     // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
     // type, so problems happen at isel time.
     // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
     // purposes, but takes a v4i1 instead of a v2i1.
   }
   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
-  Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
-  TCP.Predicate->replaceAllUsesWith(TailPredicate);
-  NewPredicates[TCP.Predicate] = cast<Instruction>(TailPredicate);
+  Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
+  ActiveLaneMask->replaceAllUsesWith(VCTPCall);
+  NewPredicates[ActiveLaneMask] = cast<Instruction>(VCTPCall);

   // Add the incoming value to the new phi.
   // TODO: This add likely already exists in the loop.
   Value *Remaining = Builder.CreateSub(Processed, Factor);
   Processed->addIncoming(Remaining, L->getLoopLatch());
   LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
                     << *Processed << "\n"
-             << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+             << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
 }

 bool MVETailPredication::TryConvert(Value *TripCount) {
   if (!IsPredicatedVectorLoop()) {
     LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
     return false;
   }

   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");

-  // Walk through the masked intrinsics and try to find whether the predicate
-  // operand is generated from an induction variable.
   SetVector<Instruction *> Predicates;
   DenseMap<Instruction *, Instruction *> NewPredicates;

-#ifndef NDEBUG
-  // For debugging purposes, use this to indicate we have been able to
-  // pattern match the scalar loop trip count.
-  bool FoundScalarTC = false;
-#endif
-
+  // Walk through the masked intrinsics and try to find whether the predicate
+  // operand is generated by intrinsic @llvm.get.active.lane.mask().
   for (auto *I : MaskedInsts) {
-    Intrinsic::ID ID = I->getIntrinsicID();
-    // First, find the icmp used by this masked load/store.
-    unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
+    unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3;
     auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
     if (!Predicate || Predicates.count(Predicate))
       continue;

-    // Step 1: using this icmp, now calculate the number of elements
-    // processed by this loop.
-    TripCountPattern TCP(Predicate, TripCount, getVectorType(I));
-    if (!(ComputeConstElements(TCP) || ComputeRuntimeElements(TCP)))
+    ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+    if (!ActiveLaneMask ||
+        ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
       continue;

-    LLVM_DEBUG(FoundScalarTC = true);
-
-    if (!isTailPredicate(TCP)) {
-      LLVM_DEBUG(dbgs() << "ARM TP: Not an icmp that generates tail predicate: "
-                        << *Predicate << "\n");
-      continue;
-    }
-
-    LLVM_DEBUG(dbgs() << "ARM TP: Found icmp generating tail predicate: "
-                      << *Predicate << "\n");
     Predicates.insert(Predicate);
+    LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
+                      << *ActiveLaneMask << "\n");

-    // Step 2: emit the VCTP intrinsic representing the effect of TP.
-    InsertVCTPIntrinsic(TCP, NewPredicates);
-  }
-
-  if (!NewPredicates.size()) {
-    LLVM_DEBUG(if (!FoundScalarTC)
-      dbgs() << "ARM TP: Can't determine loop itertion count\n");
-    return false;
+    VecTy = getVectorType(I);
+    if (!IsSafeActiveMask(TripCount, VecTy)) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates);
   }

   // Now clean up.
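+  // That is, remove the now-dead @llvm.get.active.lane.mask computation, and
+  // clone the VCTP for any duplicated predicate in the exit block (see
+  // Cleanup above).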
   ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
   return true;
 }

 Pass *llvm::createMVETailPredicationPass() {
   return new MVETailPredication();
 }

 char MVETailPredication::ID = 0;

 INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
 INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
index ad7920007267..54ddf6468336 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -1,337 +1,359 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s

 ; CHECK-LABEL: mul_v16i8
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
 define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 15
   %tmp9 = lshr i32 %tmp8, 4
   %tmp10 = shl nuw i32 %tmp9, 4
   %tmp11 = add i32 %tmp10, -16
   %tmp12 = lshr i32 %tmp11, 4
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
   %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
-  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
-  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
+  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
   %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
-  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
+  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
   %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
   %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
-  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1)
+  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
   %index.next = add i32 %index, 16
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; CHECK-LABEL: mul_v8i16
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
 define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 7
   %tmp9 = lshr i32 %tmp8, 3
   %tmp10 = shl nuw i32 %tmp9, 3
   %tmp11 = add i32 %tmp10, -8
   %tmp12 = lshr i32 %tmp11, 3
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
-  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
+  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
   %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
-  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
+  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
   %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
   %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
-  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %tmp1)
+  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
   %index.next = add i32 %index, 8
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; CHECK-LABEL: mul_v4i32
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
 define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
-  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
-  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; CHECK-LABEL: split_vector
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
 define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
   %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 1, i32 3>
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
-  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
   %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 1, i32 3>
   %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
   %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
   %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
-  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; One of the loads now uses ult predicate.
 ; CHECK-LABEL: mismatch_load_pred
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
 define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
-  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
+  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; The store now uses ult predicate.
 ; CHECK-LABEL: mismatch_store_pred
+; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
+; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
 define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
-  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
 declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 declare void @llvm.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
-
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
index 74a95cbe7d1c..dab642b94be0 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
@@ -1,145 +1,157 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve.fp -mve-tail-predication -disable-mve-tail-predication=false %s -S -o - | FileCheck %s

 define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* noalias nocapture readonly %s2, i32 %x, i32* noalias nocapture %d, i32 %n) {
 ; CHECK-LABEL: @_Z4loopPiPjiS0_i(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP63:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP63]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.lr.ph:
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[N_RND_UP77:%.*]] = add nuw i32 [[N]], 3
 ; CHECK-NEXT:    [[N_VEC79:%.*]] = and i32 [[N_RND_UP77]], -4
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_183:%.*]] = add nsw i32 [[N]], -1
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N_VEC79]], -4
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw
i32 [[TMP1]], 1 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[VECTOR_BODY75_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.body75.preheader: ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) ; CHECK-NEXT: br label [[VECTOR_BODY75:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>* -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]]) -; CHECK-NEXT: [[TMP7]] = sub i32 [[TMP5]], 4 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP6]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]]) +; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP5]], 4 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP10]] = getelementptr i32, i32* [[LSR_IV9]], i32 4 -; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1) -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: [[TMP10]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1) +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0 +; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]] ; CHECK: vector.body75: ; CHECK-NEXT: [[LSR_IV6:%.*]] = phi i32* [ [[S1:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP7:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ] -; 
CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP15:%.*]], [[VECTOR_BODY75]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT84:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX80]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT85:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT84]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION86:%.*]] = add <4 x i32> [[BROADCAST_SPLAT85]], -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP12]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef) -; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP14]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP14]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef) +; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP16]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP15]]) ; CHECK-NEXT: [[INDEX_NEXT81]] = add i32 [[INDEX80]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i32, i32* [[LSR_IV3]], i32 4 ; CHECK-NEXT: [[SCEVGEP7]] = getelementptr i32, i32* [[LSR_IV6]], i32 4 -; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1) -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: [[TMP17]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP12]], i32 1) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret i32 0 ; entry: %cmp63 = icmp sgt i32 %n, 0 br i1 %cmp63, label %for.body.lr.ph, label %for.cond.cleanup for.body.lr.ph: ; preds = %entry %tobool = icmp eq i32 %x, 0 %n.rnd.up77 = add nuw i32 %n, 3 %n.vec79 = and i32 %n.rnd.up77, -4 %trip.count.minus.183 = add nsw i32 %n, -1 %0 = add i32 
%n.vec79, -4 %1 = lshr i32 %0, 2 %2 = add nuw nsw i32 %1, 1 %3 = add nuw nsw i32 %1, 1 br i1 %tobool, label %vector.body75.preheader, label %vector.ph vector.body75.preheader: ; preds = %for.body.lr.ph call void @llvm.set.loop.iterations.i32(i32 %2) br label %vector.body75 vector.ph: ; preds = %for.body.lr.ph %broadcast.splatinsert71 = insertelement <4 x i32> undef, i32 %x, i32 0 %broadcast.splat72 = shufflevector <4 x i32> %broadcast.splatinsert71, <4 x i32> undef, <4 x i32> zeroinitializer call void @llvm.set.loop.iterations.i32(i32 %3) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv9 = phi i32* [ %scevgep10, %vector.body ], [ %d, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %4 = phi i32 [ %3, %vector.ph ], [ %8, %vector.body ] %lsr.iv911 = bitcast i32* %lsr.iv9 to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %5 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0 %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer - %7 = icmp ule <4 x i32> %induction, %6 + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.183) call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %broadcast.splat72, <4 x i32>* %lsr.iv911, i32 4, <4 x i1> %7) %index.next = add i32 %index, 4 %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 4 %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %4, i32 1) %9 = icmp ne i32 %8, 0 br i1 %9, label %vector.body, label %for.cond.cleanup vector.body75: ; preds = %vector.body75, %vector.body75.preheader %lsr.iv6 = phi i32* [ %s1, %vector.body75.preheader ], [ %scevgep7, %vector.body75 ] %lsr.iv3 = phi i32* [ %s2, %vector.body75.preheader ], [ %scevgep4, %vector.body75 ] %lsr.iv = phi i32* [ %d, %vector.body75.preheader ], [ %scevgep, %vector.body75 ] %index80 = phi i32 [ %index.next81, %vector.body75 ], [ 0, %vector.body75.preheader ] %10 = phi i32 [ %2, %vector.body75.preheader ], [ %15, %vector.body75 ] %lsr.iv68 = bitcast i32* %lsr.iv6 to <4 x i32>* %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>* %lsr.iv2 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert84 = insertelement <4 x i32> undef, i32 %index80, i32 0 %broadcast.splat85 = shufflevector <4 x i32> %broadcast.splatinsert84, <4 x i32> undef, <4 x i32> zeroinitializer %induction86 = add <4 x i32> %broadcast.splat85, %11 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0 %12 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> zeroinitializer %13 = icmp ule <4 x i32> %induction86, %12 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv68, i32 4, <4 x i1> %13, <4 x i32> undef) %wide.masked.load89 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv35, i32 4, <4 x i1> %13, <4 x i32> undef) %14 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %wide.masked.load89, <4 x i32> %wide.masked.load) call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %14, <4 x i32>* %lsr.iv2, i32 4, <4 x i1> %13) %index.next81 = add i32 %index80, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 4 %scevgep7 = getelementptr i32, i32* %lsr.iv6, i32 4 %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %10, i32 1) %16 = icmp ne i32 %15, 0 br i1 %16, label %vector.body75, label %for.cond.cleanup 
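; Note (editorial sketch, not part of the autogenerated checks): of the two
; loops above, only vector.body qualifies for tail predication, because its
; mask comes from @llvm.get.active.lane.mask. Under that assumption the pass
; rewrites, roughly (%elems and %elems.rem are illustrative names):
;
;   %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.183)
;     ==>
;   %elems = phi i32 [ %num.elements, %vector.ph ], [ %elems.rem, %vector.body ]
;   %7 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
;   %elems.rem = sub i32 %elems, 4
;
; vector.body75 still masks with a plain icmp ule chain, so it is left
; untouched and its masked operations keep their original predicates.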
for.cond.cleanup: ; preds = %vector.body, %vector.body75, %entry ret i32 0 } declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index 9af4185cae44..bf6e92a1c883 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -1,511 +1,537 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) { ; CHECK-LABEL: vpsel_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: adds r4, r3, #3 +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: sub.w r12, r4, #4 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r12, lsr #2 -; CHECK-NEXT: lsr.w r4, r12, #2 -; CHECK-NEXT: sub.w r12, r3, r4, lsl #2 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w lr, lr, r12, lsr #2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: and r5, r4, #15 +; CHECK-NEXT: and r4, r12, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vdup.32 q3, r5 +; CHECK-NEXT: vdup.32 q3, r4 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vcmp.i32 eq, q3, zr -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vpsel q1, q2, q1 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vmul.i32 q1, q1, q2 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body 
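; The mask in the loop below is produced by @llvm.get.active.lane.mask rather
; than the usual vectoriser-emitted compare. For a 4-lane loop the two forms
; should be equivalent (an informal equivalence, using the names of this test):
;
;   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
;     <=>
;   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
;   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
;
; with %trip.count.minus.1 acting as the backedge-taken count.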
vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index %tmp6 = bitcast i32* %tmp5 to <4 x i32>* %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %rem = urem i32 %index, 16 %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0 %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %cmp = icmp eq <4 x i32> %rem.broadcast.splat, %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a %add = add nsw <4 x i32> %mul, %vec.phi %index.next = add i32 %index, 4 %tmp7 = icmp eq i32 %index.next, %n.vec br i1 %tmp7, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ] ret i32 %res.0.lcssa } define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, ; CHECK-LABEL: vpsel_mul_reduce_add_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: ldr.w r12, [sp, #20] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r5, r12, #3 +; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: subs r4, r5, #4 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: add.w lr, r5, r4, lsr #2 -; CHECK-NEXT: lsrs r4, r4, #2 -; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: bic r4, r4, #3 +; CHECK-NEXT: sub.w lr, r4, #4 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: and r6, r5, #15 +; CHECK-NEXT: and r5, r4, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 -; CHECK-NEXT: vdup.32 q3, r6 +; 
CHECK-NEXT: vdup.32 q3, r5 ; CHECK-NEXT: vsub.i32 q1, q2, q1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vcmp.i32 eq, q3, zr -; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vpsel q1, q1, q2 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vmul.i32 q1, q1, q2 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r4 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index %tmp6 = bitcast i32* %tmp5 to <4 x i32>* %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index %tmp8 = bitcast i32* %tmp7 to <4 x i32>* %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d %rem = urem i32 %index, 16 %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0 %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %cmp = icmp eq <4 x i32> %rem.broadcast.splat, %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b %mul = mul <4 x i32> %sel, %wide.masked.load.a %add = add <4 x i32> %mul, %vec.phi %index.next = add i32 %index, 4 %cmp.exit = icmp eq i32 %index.next, %n.vec br i1 %cmp.exit, label %middle.block, label %vector.body middle.block: ; preds = %vector.body 
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ] ret i32 %res.0.lcssa } define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, ; CHECK-LABEL: and_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: ldr.w r12, [sp, #16] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: ldr.w r12, [sp, #12] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB2_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: subs r5, r4, #4 +; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: lsrs r4, r5, #2 -; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vsub.i32 q1, q2, q1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vcmpt.i32 eq, q1, zr ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r4 +; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) 
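; Editorial note: in this function the mask for the later loads is not the
; lane mask itself but %mask = and <4 x i1> %cmp, %tmp1 (defined below).
; Tail predication should still be legal here, since and-ing with the lane
; mask can only switch lanes off, never on; after conversion the derived mask
; becomes, as a sketch (%vctp is an illustrative name for the replacement of
; %tmp1):
;
;   %mask = and <4 x i1> %cmp, %vctp
;
; which the backend realises as the vpsttt/vcmpt block in the assembly above.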
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b %cmp = icmp eq <4 x i32> %sub, %mask = and <4 x i1> %cmp, %tmp1 %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index %tmp6 = bitcast i32* %tmp5 to <4 x i32>* %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef) %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index %tmp8 = bitcast i32* %tmp7 to <4 x i32>* %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef) %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d %add = add <4 x i32> %mul, %vec.phi %index.next = add i32 %index, 4 %cmp.exit = icmp eq i32 %index.next, %n.vec br i1 %cmp.exit, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ] ret i32 %res.0.lcssa } define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { ; CHECK-LABEL: or_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: ldr.w r12, [sp, #16] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: ldr.w r12, [sp, #12] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB3_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: add.w r4, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: subs r5, r4, #4 +; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: lsrs r4, r5, #2 -; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vpnot +; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpnot ; CHECK-NEXT: vpstee ; CHECK-NEXT: vcmpt.i32 ne, q1, zr ; CHECK-NEXT: vldrwe.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwe.u32 q2, [r2], #16 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r4 +; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add 
i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b %cmp = icmp eq <4 x i32> %sub, %mask = or <4 x i1> %cmp, %tmp1 %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index %tmp6 = bitcast i32* %tmp5 to <4 x i32>* %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef) %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index %tmp8 = bitcast i32* %tmp7 to <4 x i32>* %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef) %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d %add = add <4 x i32> %mul, %vec.phi %index.next = add i32 %index, 4 %cmp.exit = icmp eq i32 %index.next, %n.vec br i1 %cmp.exit, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ] ret i32 %res.0.lcssa } define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2) { ; CHECK-LABEL: continue_on_zero: ; CHECK: @ %bb.0: @ %bb ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r0] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %bb27 ; CHECK-NEXT: pop {r7, pc} bb: %tmp = icmp eq i32 %arg2, 0 br i1 %tmp, label %bb27, label %bb3 bb3: ; preds = %bb %tmp4 = add i32 %arg2, 3 %tmp5 = and i32 %tmp4, -4 %tmp6 = add i32 %arg2, -1 %tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0 %tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer br label %bb9 bb9: ; preds 
= %bb9, %bb3 %tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ] %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0 %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer %tmp13 = add <4 x i32> %tmp12, <i32 0, i32 1, i32 2, i32 3> %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10 - %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 + + ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 + %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %tmp6) + %tmp16 = bitcast i32* %tmp14 to <4 x i32>* %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef) %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer %tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp10 %tmp20 = and <4 x i1> %tmp18, %tmp15 %tmp21 = bitcast i32* %tmp19 to <4 x i32>* %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp21, i32 4, <4 x i1> %tmp20, <4 x i32> undef) %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17 %tmp24 = bitcast i32* %tmp19 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %tmp24, i32 4, <4 x i1> %tmp20) %tmp25 = add i32 %tmp10, 4 %tmp26 = icmp eq i32 %tmp25, %tmp5 br i1 %tmp26, label %bb27, label %bb9 bb27: ; preds = %bb9, %bb ret void } define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32 %arg3) { ; CHECK-LABEL: range_test: ; CHECK: @ %bb.0: @ %bb ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %bb32 ; CHECK-NEXT: pop {r7, pc} bb: %tmp = icmp eq i32 %arg3, 0 br i1 %tmp, label %bb32, label %bb4 bb4: ; preds = %bb %tmp5 = add i32 %arg3, 3 %tmp6 = and i32 %tmp5, -4 %tmp7 = add i32 %arg3, -1 %tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0 %tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer %tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0 %tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer br label %bb12 bb12: ; preds = %bb12, %bb4 %tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ] %tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0 %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer %tmp16 = add <4 x i32> %tmp15, <i32 0, i32 1, i32 2, i32 3> %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13 - %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 + + ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 + %tmp18 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %tmp7) + %tmp19 = bitcast i32* %tmp17 to <4 x i32>* %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef) %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer %tmp22 = icmp sle <4 x i32> %tmp20, %tmp11 %tmp23 = getelementptr inbounds i32, i32* %arg1, i32 %tmp13 %tmp24 = and <4 x i1> %tmp22, %tmp21 %tmp25 = and <4 x i1> %tmp24, %tmp18 %tmp26 = bitcast i32* %tmp23 to <4 x i32>* %tmp27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp26, i32 4, <4 x i1> %tmp25, <4 x i32> undef) %tmp28 = mul

nsw <4 x i32> %tmp27, %tmp20 %tmp29 = bitcast i32* %tmp17 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp28, <4 x i32>* %tmp29, i32 4, <4 x i1> %tmp25) %tmp30 = add i32 %tmp13, 4 %tmp31 = icmp eq i32 %tmp30, %tmp6 br i1 %tmp31, label %bb32, label %bb12 bb32: ; preds = %bb12, %bb ret void } ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) ; Function Attrs: nounwind readnone willreturn declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll index b4846cd824e7..8d201a23a689 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -1,225 +1,247 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false %s -o - | FileCheck %s define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: sext_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.s16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = sext <8 x i8> %wide.masked.load to <8 x i16> %4 = getelementptr inbounds i16, i16* %a, i32 %index %5 = bitcast i16* %4 to <8 x i16>* %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %5, i32 2, <8 x i1> %1, <8 x i16> undef) %6 = add <8 x i16> %wide.masked.load12, %3 %7 = bitcast i16* %4 to <8 x i16>* call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %6, <8 x i16>* %7, i32 2, <8 x i1> %1) %index.next 
= add i32 %index, 8 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } ; Function Attrs: nofree norecurse nounwind define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: zext_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> %4 = getelementptr inbounds i16, i16* %a, i32 %index %5 = bitcast i16* %4 to <8 x i16>* %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %5, i32 2, <8 x i1> %1, <8 x i16> undef) %6 = add <8 x i16> %wide.masked.load12, %3 %7 = bitcast i16* %4 to <8 x i16>* call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %6, <8 x i16>* %7, i32 2, <8 x i1> %1) %index.next = add i32 %index, 8 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } ; Function Attrs: nofree norecurse nounwind define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: sext_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph 
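; Editorial note on mask widths in this file (a summary of the tests, not
; autogenerated output): the lane count of the @llvm.get.active.lane.mask
; call decides which VCTP the pass emits and how fast the element counter
; drops per iteration, e.g. (%elems is an illustrative name):
;
;   <8 x i1> -> @llvm.arm.mve.vctp16(i32 %elems), %elems -= 8   ; dlstp.16 above
;   <4 x i1> -> @llvm.arm.mve.vctp32(i32 %elems), %elems -= 4   ; dlstp.32 below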
vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> %4 = getelementptr inbounds i32, i32* %a, i32 %index %5 = bitcast i32* %4 to <4 x i32>* %wide.masked.load10 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %1, <4 x i32> undef) %6 = add nsw <4 x i32> %wide.masked.load10, %3 %7 = bitcast i32* %4 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %7, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } ; Function Attrs: nofree norecurse nounwind define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: zext_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.u32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* 
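; The masked load below is a narrow <4 x i16> load that is zero-extended to
; <4 x i32>; because its <4 x i1> mask has the same lane count as the vctp32
; predicate, the extending load stays predicable and is selected as
; vldrh.u32 in the assembly above (editorial observation).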
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> %4 = getelementptr inbounds i32, i32* %a, i32 %index %5 = bitcast i32* %4 to <4 x i32>* %wide.masked.load10 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %1, <4 x i32> undef) %6 = add <4 x i32> %wide.masked.load10, %3 %7 = bitcast i32* %4 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %7, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index efc37128e817..0380d5f20bc7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -1,588 +1,596 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 -disable-mve-tail-predication=false %s -o - | FileCheck %s define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) { ; CHECK-LABEL: fast_float_mul: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB0_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck ; CHECK-NEXT: add.w r4, r0, r3, lsl #2 ; CHECK-NEXT: add.w r5, r2, r3, lsl #2 ; CHECK-NEXT: cmp r4, r2 ; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: add.w r5, r1, r3, lsl #2 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r4 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r6, r6, lr ; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB0_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: subs r6, r3, #1 ; CHECK-NEXT: and r7, r3, #3 ; CHECK-NEXT: cmp r6, #3 ; CHECK-NEXT: bhs .LBB0_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_4: @ %vector.ph +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 ; CHECK-NEXT: vmul.f32 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_5 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: 
subs r3, #4 ; CHECK-NEXT: add.w lr, r12, r3, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, r1, r3 ; CHECK-NEXT: adds r5, r2, r3 ; CHECK-NEXT: adds r6, r0, r3 ; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: vldr s0, [r4] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldr s2, [r5] ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r6] ; CHECK-NEXT: vldr s0, [r4, #4] ; CHECK-NEXT: vldr s2, [r5, #4] ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r6, #4] ; CHECK-NEXT: vldr s0, [r4, #8] ; CHECK-NEXT: vldr s2, [r5, #8] ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r6, #8] ; CHECK-NEXT: vldr s0, [r4, #12] ; CHECK-NEXT: vldr s2, [r5, #12] ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r6, #12] ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r7, .LBB0_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: .LBB0_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s0, [r1] ; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: vldr s2, [r2] ; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r0] ; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: le lr, .LBB0_10 ; CHECK-NEXT: .LBB0_11: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck vector.memcheck: ; preds = %entry %scevgep = getelementptr float, float* %a, i32 %N %scevgep13 = getelementptr float, float* %b, i32 %N %scevgep16 = getelementptr float, float* %c, i32 %N %bound0 = icmp ugt float* %scevgep13, %a %bound1 = icmp ugt float* %scevgep, %b %found.conflict = and i1 %bound0, %bound1 %bound018 = icmp ugt float* %scevgep16, %a %bound119 = icmp ugt float* %scevgep, %c %found.conflict20 = and i1 %bound018, %bound119 %conflict.rdx = or i1 %found.conflict, %found.conflict20 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph for.body.preheader: ; preds = %vector.memcheck %0 = add i32 %N, -1 %xtraiter = and i32 %N, 3 %1 = icmp ult i32 %0, 3 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new for.body.preheader.new: ; preds = %for.body.preheader %unroll_iter = sub i32 %N, %xtraiter br label %for.body vector.ph: ; preds = %vector.memcheck %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds float, float* %b, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + + ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast 
float* %2 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef) %5 = getelementptr inbounds float, float* %c, i32 %index %6 = bitcast float* %5 to <4 x float>* %wide.masked.load23 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %6, i32 4, <4 x i1> %3, <4 x float> undef) %7 = fmul fast <4 x float> %wide.masked.load23, %wide.masked.load %8 = getelementptr inbounds float, float* %a, i32 %index %9 = bitcast float* %8 to <4 x float>* call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %7, <4 x float>* %9, i32 4, <4 x i1> %3) %index.next = add i32 %index, 4 %10 = icmp eq i32 %index.next, %n.vec br i1 %10, label %for.cond.cleanup, label %vector.body for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] %lcmp.mod = icmp eq i32 %xtraiter, 0 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ] %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] %arrayidx.epil = getelementptr inbounds float, float* %b, i32 %i.09.epil %11 = load float, float* %arrayidx.epil, align 4 %arrayidx1.epil = getelementptr inbounds float, float* %c, i32 %i.09.epil %12 = load float, float* %arrayidx1.epil, align 4 %mul.epil = fmul fast float %12, %11 %arrayidx2.epil = getelementptr inbounds float, float* %a, i32 %i.09.epil store float %mul.epil, float* %arrayidx2.epil, align 4 %inc.epil = add nuw i32 %i.09.epil, 1 %epil.iter.sub = add i32 %epil.iter, -1 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry ret void for.body: ; preds = %for.body, %for.body.preheader.new %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] %arrayidx = getelementptr inbounds float, float* %b, i32 %i.09 %13 = load float, float* %arrayidx, align 4 %arrayidx1 = getelementptr inbounds float, float* %c, i32 %i.09 %14 = load float, float* %arrayidx1, align 4 %mul = fmul fast float %14, %13 %arrayidx2 = getelementptr inbounds float, float* %a, i32 %i.09 store float %mul, float* %arrayidx2, align 4 %inc = or i32 %i.09, 1 %arrayidx.1 = getelementptr inbounds float, float* %b, i32 %inc %15 = load float, float* %arrayidx.1, align 4 %arrayidx1.1 = getelementptr inbounds float, float* %c, i32 %inc %16 = load float, float* %arrayidx1.1, align 4 %mul.1 = fmul fast float %16, %15 %arrayidx2.1 = getelementptr inbounds float, float* %a, i32 %inc store float %mul.1, float* %arrayidx2.1, align 4 %inc.1 = or i32 %i.09, 2 %arrayidx.2 = getelementptr inbounds float, float* %b, i32 %inc.1 %17 = load float, float* %arrayidx.2, align 4 %arrayidx1.2 = getelementptr inbounds float, float* %c, i32 %inc.1 %18 = load float, float* %arrayidx1.2, align 4 %mul.2 = fmul fast float %18, %17 %arrayidx2.2 = getelementptr inbounds float, float* %a, i32 %inc.1 store float %mul.2, float* %arrayidx2.2, align 4 %inc.2 = or i32 %i.09, 3 %arrayidx.3 = getelementptr inbounds float, float* %b, i32 %inc.2 %19 = load float, float* %arrayidx.3, align 4 %arrayidx1.3 = getelementptr inbounds 
float, float* %c, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %mul.3 = fmul fast float %20, %19
  %arrayidx2.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  store float %mul.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) {
; CHECK-LABEL: fast_float_mac:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB1_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: lsr.w r3, r12, #2
-; CHECK-NEXT: sub.w r3, r2, r3, lsl #2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
-; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vadd.f32 q0, q0, r0
; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: vldr s0, .LCPI1_0
; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.5:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %b, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+
+; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %c, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load13 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load13, %wide.masked.load
  %6 = fadd fast <4 x float> %5, %vec.phi
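; NOTE: illustration only, not part of the autogenerated test. Each lane i of
; %1 above evaluates (%index + i) ule %trip.count.minus.1, i.e. exactly the
; icmp that is kept as a comment next to the call. E.g. with %N = 10 we have
; %trip.count.minus.1 = 9; the last vector iteration runs with %index = 8, so
; %1 = <1, 1, 0, 0> and only the first two lanes load and accumulate.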
%index.next = add i32 %index, 4 %7 = icmp eq i32 %index.next, %n.vec br i1 %7, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %8 = select <4 x i1> %1, <4 x float> %6, <4 x float> %vec.phi %rdx.shuf = shufflevector <4 x float> %8, <4 x float> undef, <4 x i32> %bin.rdx = fadd fast <4 x float> %8, %rdx.shuf %rdx.shuf14 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> %bin.rdx15 = fadd fast <4 x float> %bin.rdx, %rdx.shuf14 %9 = extractelement <4 x float> %bin.rdx15, i32 0 br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %a.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %9, %middle.block ] ret float %a.0.lcssa } define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, half* nocapture readonly %c, i32 %N) { ; CHECK-LABEL: fast_float_half_mac: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB2_22 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q5, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r2, #1 ; CHECK-NEXT: adr r2, .LCPI2_1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vdup.32 q2, r12 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_2: @ %cond.load25 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s0, s28 ; CHECK-NEXT: vmov r4, s28 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q6[0], r4 ; CHECK-NEXT: vldr.16 s0, [r1, #6] ; CHECK-NEXT: vmov.16 q6[1], r2 ; CHECK-NEXT: vmov r2, s29 ; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q6[3], r2 ; CHECK-NEXT: .LBB2_3: @ %else26 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmul.f16 q5, q6, q5 ; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: vmovx.f16 s2, s21 ; CHECK-NEXT: vmovx.f16 s0, s20 ; CHECK-NEXT: vcvtb.f32.f16 s27, s2 ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: vcvtb.f32.f16 s26, s21 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vcvtb.f32.f16 s25, s0 ; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vcvtb.f32.f16 s24, s20 ; CHECK-NEXT: vadd.f32 q5, q3, q6 ; CHECK-NEXT: bne .LBB2_4 ; CHECK-NEXT: b .LBB2_21 ; CHECK-NEXT: .LBB2_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: @ implicit-def: $q6 ; CHECK-NEXT: vadd.i32 q4, q0, r3 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vmrs r4, p0 ; CHECK-NEXT: and r2, r4, #1 ; CHECK-NEXT: rsbs r5, r2, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: bfi r2, r5, #0, #1 ; CHECK-NEXT: ubfx r5, r4, #4, #1 ; CHECK-NEXT: rsbs r5, r5, #0 ; CHECK-NEXT: bfi r2, r5, #1, #1 ; CHECK-NEXT: ubfx r5, r4, #8, #1 ; CHECK-NEXT: ubfx r4, r4, #12, #1 ; CHECK-NEXT: rsbs r5, r5, #0 ; CHECK-NEXT: bfi r2, r5, #2, #1 ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: bfi r2, r4, #3, #1 ; CHECK-NEXT: lsls r4, r2, #31 ; CHECK-NEXT: bne .LBB2_9 ; CHECK-NEXT: @ %bb.5: @ %else ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bpl .LBB2_10 ; CHECK-NEXT: .LBB2_6: @ %cond.load6 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s20, [r0, #2] ; CHECK-NEXT: vmov r5, s24 ; CHECK-NEXT: vmovx.f16 s24, s25 ; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vmov.16 
q5[0], r5 ; CHECK-NEXT: vmov.16 q5[1], r4 ; CHECK-NEXT: vmov r4, s25 ; CHECK-NEXT: vmov.16 q5[2], r4 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov.16 q5[3], r4 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bmi .LBB2_11 ; CHECK-NEXT: .LBB2_7: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi .LBB2_12 ; CHECK-NEXT: .LBB2_8: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: b .LBB2_13 ; CHECK-NEXT: .LBB2_9: @ %cond.load ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s24, [r0] ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bmi .LBB2_6 ; CHECK-NEXT: .LBB2_10: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_7 ; CHECK-NEXT: .LBB2_11: @ %cond.load9 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s24, s20 ; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vldr.16 s28, [r0, #4] ; CHECK-NEXT: vmov r5, s24 ; CHECK-NEXT: vmov.16 q6[0], r4 ; CHECK-NEXT: vmovx.f16 s20, s21 ; CHECK-NEXT: vmov.16 q6[1], r5 ; CHECK-NEXT: vmov r4, s28 ; CHECK-NEXT: vmov.16 q6[2], r4 ; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vmov.16 q6[3], r4 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bpl .LBB2_8 ; CHECK-NEXT: .LBB2_12: @ %cond.load12 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s20, s24 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov.16 q5[0], r4 ; CHECK-NEXT: vmov.16 q5[1], r2 ; CHECK-NEXT: vmov r2, s25 ; CHECK-NEXT: vldr.16 s24, [r0, #6] ; CHECK-NEXT: vmov.16 q5[2], r2 ; CHECK-NEXT: vmov r2, s24 ; CHECK-NEXT: vmov.16 q5[3], r2 ; CHECK-NEXT: .LBB2_13: @ %else13 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 ; CHECK-NEXT: @ implicit-def: $q7 ; CHECK-NEXT: vmrs r4, p0 ; CHECK-NEXT: and r2, r4, #1 ; CHECK-NEXT: rsbs r5, r2, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: bfi r2, r5, #0, #1 ; CHECK-NEXT: ubfx r5, r4, #4, #1 ; CHECK-NEXT: rsbs r5, r5, #0 ; CHECK-NEXT: bfi r2, r5, #1, #1 ; CHECK-NEXT: ubfx r5, r4, #8, #1 ; CHECK-NEXT: ubfx r4, r4, #12, #1 ; CHECK-NEXT: rsbs r5, r5, #0 ; CHECK-NEXT: bfi r2, r5, #2, #1 ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: bfi r2, r4, #3, #1 ; CHECK-NEXT: lsls r4, r2, #31 ; CHECK-NEXT: bne .LBB2_17 ; CHECK-NEXT: @ %bb.14: @ %else17 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bpl .LBB2_18 ; CHECK-NEXT: .LBB2_15: @ %cond.load19 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s24, [r1, #2] ; CHECK-NEXT: vmov r5, s28 ; CHECK-NEXT: vmovx.f16 s28, s29 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov.16 q6[0], r5 ; CHECK-NEXT: vmov.16 q6[1], r4 ; CHECK-NEXT: vmov r4, s29 ; CHECK-NEXT: vmov.16 q6[2], r4 ; CHECK-NEXT: vmov r4, s28 ; CHECK-NEXT: vmov.16 q6[3], r4 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bmi .LBB2_19 ; CHECK-NEXT: .LBB2_16: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi.w .LBB2_2 ; CHECK-NEXT: b .LBB2_20 ; CHECK-NEXT: .LBB2_17: @ %cond.load16 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s28, [r1] ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bmi .LBB2_15 ; CHECK-NEXT: .LBB2_18: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmov q6, q7 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_16 ; CHECK-NEXT: .LBB2_19: @ %cond.load22 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s28, s24 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vldr.16 s0, [r1, #4] ; CHECK-NEXT: vmov r5, 
s28 ; CHECK-NEXT: vmov.16 q7[0], r4 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q7[1], r5 ; CHECK-NEXT: vmovx.f16 s0, s25 ; CHECK-NEXT: vmov.16 q7[2], r4 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi.w .LBB2_2 ; CHECK-NEXT: .LBB2_20: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmov q6, q7 ; CHECK-NEXT: b .LBB2_3 ; CHECK-NEXT: .LBB2_21: @ %middle.block ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: vcmp.u32 cs, q0, q4 ; CHECK-NEXT: vpsel q0, q5, q3 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vadd.f32 q0, q0, r0 ; CHECK-NEXT: b .LBB2_23 ; CHECK-NEXT: .LBB2_22: ; CHECK-NEXT: vldr s0, .LCPI2_0 ; CHECK-NEXT: .LBB2_23: @ %for.cond.cleanup ; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.24: ; CHECK-NEXT: .LCPI2_1: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .LCPI2_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %7, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds half, half* %b, i32 %index %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %2 = bitcast half* %0 to <4 x half>* %wide.masked.load = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %2, i32 2, <4 x i1> %1, <4 x half> undef) %3 = getelementptr inbounds half, half* %c, i32 %index %4 = bitcast half* %3 to <4 x half>* %wide.masked.load13 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %4, i32 2, <4 x i1> %1, <4 x half> undef) %5 = fmul fast <4 x half> %wide.masked.load13, %wide.masked.load %6 = fpext <4 x half> %5 to <4 x float> %7 = fadd fast <4 x float> %vec.phi, %6 %index.next = add i32 %index, 4 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %1, <4 x float> %7, <4 x float> %vec.phi %rdx.shuf = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> %bin.rdx = fadd fast <4 x float> %9, %rdx.shuf %rdx.shuf14 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> %bin.rdx15 = fadd fast <4 x float> %bin.rdx, %rdx.shuf14 %10 = extractelement <4 x float> %bin.rdx15, i32 0 br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %a.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %10, %middle.block ] ret float %a.0.lcssa } ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x 
float>)

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)

; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32 immarg, <4 x i1>, <4 x half>)

+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index b76ca06ecec3..6c1273db3f80 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -1,1235 +1,1274 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false %s -o - | FileCheck %s

define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: test_acc_scalar_char:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: lsr.w r3, r12, #2
-; CHECK-NEXT: sub.w r3, r2, r3, lsl #2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrbt.u32 q2, [r1], #4
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp7 = icmp eq i32 %N, 0
  br i1 %cmp7, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %conv = zext i8 %a to i32
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
  %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i8, i8* %b, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
  %2 = bitcast i8* %0 to <4 x i8>*
  %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
  %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
  %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
  %5 = add nuw nsw <4 x i32> %4, %vec.phi
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
  %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
  ret i32 %res.0.lcssa
}

define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: test_acc_scalar_short:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: lsr.w r3, r12, #2
-; CHECK-NEXT: sub.w r3, r2, r3, lsl #2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.s32 q2, [r1], #8
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp7 = icmp eq i32 %N, 0
  br i1 %cmp7, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %conv = sext i16 %a to i32
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
  %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i16, i16* %b, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
  %2 = bitcast i16* %0 to <4 x i16>*
  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
  %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
  %4 = mul nsw <4 x i32> %broadcast.splat13, %3
  %5 = add nsw <4 x i32> %4, %vec.phi
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
  %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %middle.block, %entry
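; NOTE: illustration only, not part of the autogenerated test. The select in
; middle.block above re-applies the final loop mask %1, so lanes past %N fall
; back to %vec.phi before the horizontal vaddv reduction. This appears to be
; why the updated CHECK lines no longer rematerialize the predicate with a
; standalone 'vctp.32 r3' ahead of the vpsel.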
%res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ] ret i32 %res.0.lcssa } define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u32 q2, [r1], #4 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %conv = zext i8 %a to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3 %5 = add nuw nsw <4 x i32> %4, %vec.phi %index.next = add i32 %index, 4 %6 = icmp eq i32 %index.next, %n.vec br i1 %6, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ] ret i32 %res.0.lcssa } define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_ushort: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; 
CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u32 q2, [r1], #8 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %conv = sext i16 %a to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> %4 = mul nsw <4 x i32> %broadcast.splat13, %3 %5 = add nsw <4 x i32> %4, %vec.phi %index.next = add i32 %index, 4 %6 = icmp eq i32 %index.next, %n.vec br i1 %6, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ] ret i32 %res.0.lcssa } define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_int: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: adds r3, #4 ; 
CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12 %4 = add nsw <4 x i32> %3, %vec.phi %index.next = add i32 %index, 4 %5 = icmp eq i32 %index.next, %n.vec br i1 %5, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %7, %middle.block ] ret i32 %res.0.lcssa } define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; 
CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb r7, [r6, #-1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] ; CHECK-NEXT: ldrb r7, [r6] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] ; CHECK-NEXT: ldrb r7, [r6, #1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] ; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: ldrb r7, [r6, #2] ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r9, .LBB5_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: add r1, r12 ; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r6, [r0], #1 ; CHECK-NEXT: ldrb r5, [r1], #1 ; CHECK-NEXT: smlabb r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB5_10 ; CHECK-NEXT: .LBB5_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %res12 = bitcast i32* %res to i8* %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph for.body.lr.ph: ; preds = %entry %conv3 = zext i8 %c to i32 %scevgep = getelementptr i32, i32* %res, i32 %N %scevgep13 = bitcast i32* %scevgep to i8* %scevgep14 = getelementptr i8, i8* %a, i32 %N %scevgep15 = getelementptr i8, i8* %b, i32 %N %bound0 = icmp ugt i8* %scevgep14, %res12 %bound1 = icmp ugt i8* %scevgep13, %a %found.conflict = and i1 %bound0, %bound1 %bound016 = icmp ugt i8* %scevgep15, %res12 %bound117 = icmp ugt i8* %scevgep13, %b %found.conflict18 = and i1 %bound016, %bound117 %conflict.rdx = or i1 %found.conflict, %found.conflict18 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph for.body.preheader: ; preds = %for.body.lr.ph %0 = add i32 %N, -1 %xtraiter = and i32 %N, 3 %1 = icmp ult i32 %0, 3 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new for.body.preheader.new: ; preds = %for.body.preheader %unroll_iter = sub i32 %N, %xtraiter br label %for.body vector.ph: ; preds = %for.body.lr.ph %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> 
zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + + ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> %6 = getelementptr inbounds i8, i8* %b, i32 %index %7 = bitcast i8* %6 to <4 x i8>* %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef) %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32> %9 = mul nuw nsw <4 x i32> %8, %5 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23 %11 = getelementptr inbounds i32, i32* %res, i32 %index %12 = bitcast i32* %11 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3) %index.next = add i32 %index, 4 %13 = icmp eq i32 %index.next, %n.vec br i1 %13, label %for.cond.cleanup, label %vector.body for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] %lcmp.mod = icmp eq i32 %xtraiter, 0 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ] %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil %14 = load i8, i8* %arrayidx.epil, align 1 %conv.epil = zext i8 %14 to i32 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil %15 = load i8, i8* %arrayidx1.epil, align 1 %conv2.epil = zext i8 %15 to i32 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil %add.epil = add nuw nsw i32 %mul.epil, %conv3 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil store i32 %add.epil, i32* %arrayidx4.epil, align 4 %inc.epil = add nuw i32 %i.011.epil, 1 %epil.iter.sub = add i32 %epil.iter, -1 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry ret void for.body: ; preds = %for.body, %for.body.preheader.new %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011 %16 = load i8, i8* %arrayidx, align 1 %conv = zext i8 %16 to i32 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011 %17 = load i8, i8* %arrayidx1, align 1 %conv2 = zext i8 %17 to i32 %mul = mul nuw nsw i32 %conv2, %conv %add = add nuw nsw i32 %mul, %conv3 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011 store i32 %add, i32* %arrayidx4, align 4 %inc = or i32 %i.011, 1 %arrayidx.1 = getelementptr inbounds 
i8, i8* %a, i32 %inc %18 = load i8, i8* %arrayidx.1, align 1 %conv.1 = zext i8 %18 to i32 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc %19 = load i8, i8* %arrayidx1.1, align 1 %conv2.1 = zext i8 %19 to i32 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1 %add.1 = add nuw nsw i32 %mul.1, %conv3 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc store i32 %add.1, i32* %arrayidx4.1, align 4 %inc.1 = or i32 %i.011, 2 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1 %20 = load i8, i8* %arrayidx.2, align 1 %conv.2 = zext i8 %20 to i32 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1 %21 = load i8, i8* %arrayidx1.2, align 1 %conv2.2 = zext i8 %21 to i32 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2 %add.2 = add nuw nsw i32 %mul.2, %conv3 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1 store i32 %add.2, i32* %arrayidx4.2, align 4 %inc.2 = or i32 %i.011, 3 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2 %22 = load i8, i8* %arrayidx.3, align 1 %conv.3 = zext i8 %22 to i32 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2 %23 = load i8, i8* %arrayidx1.3, align 1 %conv2.3 = zext i8 %23 to i32 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3 %add.3 = add nuw nsw i32 %mul.3, %conv3 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2 store i32 %add.3, i32* %arrayidx4.3, align 4 %inc.3 = add nuw i32 %i.011, 4 %niter.nsub.3 = add i32 %niter, -4 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_short: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %conv3 = sext i16 %c to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + %1 = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> %4 = getelementptr inbounds i16, i16* %b, i32 %index %5 = bitcast i16* %4 to <4 x i16>* %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef) %6 = sext <4 x i16> %wide.masked.load14 to <4 x i32> %7 = mul nsw <4 x i32> %6, %3 %8 = add nsw <4 x i32> %7, %broadcast.splat16 %9 = getelementptr inbounds i32, i32* %res, i32 %index %10 = bitcast i32* %9 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %11 = icmp eq i32 %index.next, %n.vec br i1 %11, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 ; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb r7, [r6, #-1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] ; CHECK-NEXT: ldrb r7, [r6] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] ; CHECK-NEXT: ldrb r7, [r6, #1] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] ; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: ldrb r7, [r6, #2] ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, 
#16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r9, .LBB7_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: add r1, r12 ; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r6, [r0], #1 ; CHECK-NEXT: ldrb r5, [r1], #1 ; CHECK-NEXT: smlabb r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB7_10 ; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %res12 = bitcast i32* %res to i8* %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph for.body.lr.ph: ; preds = %entry %conv3 = zext i8 %c to i32 %scevgep = getelementptr i32, i32* %res, i32 %N %scevgep13 = bitcast i32* %scevgep to i8* %scevgep14 = getelementptr i8, i8* %a, i32 %N %scevgep15 = getelementptr i8, i8* %b, i32 %N %bound0 = icmp ugt i8* %scevgep14, %res12 %bound1 = icmp ugt i8* %scevgep13, %a %found.conflict = and i1 %bound0, %bound1 %bound016 = icmp ugt i8* %scevgep15, %res12 %bound117 = icmp ugt i8* %scevgep13, %b %found.conflict18 = and i1 %bound016, %bound117 %conflict.rdx = or i1 %found.conflict, %found.conflict18 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph for.body.preheader: ; preds = %for.body.lr.ph %0 = add i32 %N, -1 %xtraiter = and i32 %N, 3 %1 = icmp ult i32 %0, 3 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new for.body.preheader.new: ; preds = %for.body.preheader %unroll_iter = sub i32 %N, %xtraiter br label %for.body vector.ph: ; preds = %for.body.lr.ph %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + +; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> %6 = getelementptr inbounds i8, i8* %b, i32 %index %7 = bitcast i8* %6 to <4 x i8>* %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef) %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32> %9 = mul nuw nsw <4 x i32> %8, %5 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23 %11 = getelementptr inbounds i32, i32* %res, i32 %index %12 = bitcast i32* %11 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, 
i32 4, <4 x i1> %3) %index.next = add i32 %index, 4 %13 = icmp eq i32 %index.next, %n.vec br i1 %13, label %for.cond.cleanup, label %vector.body for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] %lcmp.mod = icmp eq i32 %xtraiter, 0 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ] %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil %14 = load i8, i8* %arrayidx.epil, align 1 %conv.epil = zext i8 %14 to i32 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil %15 = load i8, i8* %arrayidx1.epil, align 1 %conv2.epil = zext i8 %15 to i32 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil %add.epil = add nuw nsw i32 %mul.epil, %conv3 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil store i32 %add.epil, i32* %arrayidx4.epil, align 4 %inc.epil = add nuw i32 %i.011.epil, 1 %epil.iter.sub = add i32 %epil.iter, -1 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry ret void for.body: ; preds = %for.body, %for.body.preheader.new %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011 %16 = load i8, i8* %arrayidx, align 1 %conv = zext i8 %16 to i32 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011 %17 = load i8, i8* %arrayidx1, align 1 %conv2 = zext i8 %17 to i32 %mul = mul nuw nsw i32 %conv2, %conv %add = add nuw nsw i32 %mul, %conv3 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011 store i32 %add, i32* %arrayidx4, align 4 %inc = or i32 %i.011, 1 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc %18 = load i8, i8* %arrayidx.1, align 1 %conv.1 = zext i8 %18 to i32 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc %19 = load i8, i8* %arrayidx1.1, align 1 %conv2.1 = zext i8 %19 to i32 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1 %add.1 = add nuw nsw i32 %mul.1, %conv3 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc store i32 %add.1, i32* %arrayidx4.1, align 4 %inc.1 = or i32 %i.011, 2 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1 %20 = load i8, i8* %arrayidx.2, align 1 %conv.2 = zext i8 %20 to i32 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1 %21 = load i8, i8* %arrayidx1.2, align 1 %conv2.2 = zext i8 %21 to i32 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2 %add.2 = add nuw nsw i32 %mul.2, %conv3 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1 store i32 %add.2, i32* %arrayidx4.2, align 4 %inc.2 = or i32 %i.011, 3 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2 %22 = load i8, i8* %arrayidx.3, align 1 %conv.3 = zext i8 %22 to i32 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2 %23 = load i8, i8* %arrayidx1.3, align 1 %conv2.3 = zext i8 %23 to i32 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3 %add.3 = add nuw nsw i32 %mul.3, %conv3 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2 store i32 %add.3, i32* 
%arrayidx4.3, align 4 %inc.3 = add nuw i32 %i.011, 4 %niter.nsub.3 = add i32 %niter, -4 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_ushort: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %conv3 = sext i16 %c to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> %4 = getelementptr inbounds i16, i16* %b, i32 %index %5 = bitcast i16* %4 to <4 x i16>* %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef) %6 = zext <4 x i16> %wide.masked.load14 to <4 x i32> %7 = mul nuw nsw <4 x i32> %6, %3 %8 = add nsw <4 x i32> %7, %broadcast.splat16 %9 = getelementptr inbounds i32, i32* %res, i32 %index %10 = bitcast i32* %9 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %11 = icmp eq i32 %index.next, %n.vec br i1 %11, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_int: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w 
.LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck ; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: add.w r4, r1, r12, lsl #2 ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: cset lr, hi ; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r5, r6 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq ; CHECK-NEXT: andeq.w r5, r4, lr ; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 ; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r5, lsr #2 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: add.w r6, r1, #8 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r8, [r5, #-8] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldr r7, [r6, #-8] ; CHECK-NEXT: mla r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldr r8, [r5, #-4] ; CHECK-NEXT: ldr r7, [r6, #-4] ; CHECK-NEXT: mla r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldr.w r8, [r5] ; CHECK-NEXT: ldr r7, [r6] ; CHECK-NEXT: mla r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] ; CHECK-NEXT: ldr.w r8, [r5, #4] ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: ldr r7, [r6, #4] ; CHECK-NEXT: adds r6, #16 ; CHECK-NEXT: mla r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB9_7 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r9, .LBB9_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 ; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r6, [r0], #4 ; CHECK-NEXT: ldr r5, [r1], #4 ; CHECK-NEXT: mla r6, r5, r6, r2 ; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB9_10 ; CHECK-NEXT: .LBB9_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck vector.memcheck: ; preds = %entry %scevgep = getelementptr i32, i32* %res, i32 %N %scevgep13 = getelementptr i32, i32* %a, i32 %N %scevgep16 = getelementptr i32, i32* %b, i32 %N %bound0 = icmp ugt i32* %scevgep13, %res %bound1 = icmp ugt i32* %scevgep, %a %found.conflict = and i1 %bound0, %bound1 %bound018 = icmp ugt i32* %scevgep16, %res %bound119 = icmp ugt i32* %scevgep, %b %found.conflict20 = and i1 %bound018, %bound119 %conflict.rdx = or i1 %found.conflict, %found.conflict20 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph for.body.preheader: ; preds = 
%vector.memcheck %0 = add i32 %N, -1 %xtraiter = and i32 %N, 3 %1 = icmp ult i32 %0, 3 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new for.body.preheader.new: ; preds = %for.body.preheader %unroll_iter = sub i32 %N, %xtraiter br label %for.body vector.ph: ; preds = %vector.memcheck %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0 %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i32, i32* %a, i32 %index - %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + +; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 + %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %4 = bitcast i32* %2 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef) %5 = getelementptr inbounds i32, i32* %b, i32 %index %6 = bitcast i32* %5 to <4 x i32>* %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %6, i32 4, <4 x i1> %3, <4 x i32> undef) %7 = mul nsw <4 x i32> %wide.masked.load23, %wide.masked.load %8 = add nsw <4 x i32> %7, %broadcast.splat25 %9 = getelementptr inbounds i32, i32* %res, i32 %index %10 = bitcast i32* %9 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %3) %index.next = add i32 %index, 4 %11 = icmp eq i32 %index.next, %n.vec br i1 %11, label %for.cond.cleanup, label %vector.body for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] %lcmp.mod = icmp eq i32 %xtraiter, 0 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ] %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] %arrayidx.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil %12 = load i32, i32* %arrayidx.epil, align 4 %arrayidx1.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil %13 = load i32, i32* %arrayidx1.epil, align 4 %mul.epil = mul nsw i32 %13, %12 %add.epil = add nsw i32 %mul.epil, %c %arrayidx2.epil = getelementptr inbounds i32, i32* %res, i32 %i.09.epil store i32 %add.epil, i32* %arrayidx2.epil, align 4 %inc.epil = add nuw i32 %i.09.epil, 1 %epil.iter.sub = add i32 %epil.iter, -1 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry ret void for.body: ; preds = %for.body, %for.body.preheader.new %i.09 = phi i32 [ 
0, %for.body.preheader.new ], [ %inc.3, %for.body ] %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09 %14 = load i32, i32* %arrayidx, align 4 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09 %15 = load i32, i32* %arrayidx1, align 4 %mul = mul nsw i32 %15, %14 %add = add nsw i32 %mul, %c %arrayidx2 = getelementptr inbounds i32, i32* %res, i32 %i.09 store i32 %add, i32* %arrayidx2, align 4 %inc = or i32 %i.09, 1 %arrayidx.1 = getelementptr inbounds i32, i32* %a, i32 %inc %16 = load i32, i32* %arrayidx.1, align 4 %arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc %17 = load i32, i32* %arrayidx1.1, align 4 %mul.1 = mul nsw i32 %17, %16 %add.1 = add nsw i32 %mul.1, %c %arrayidx2.1 = getelementptr inbounds i32, i32* %res, i32 %inc store i32 %add.1, i32* %arrayidx2.1, align 4 %inc.1 = or i32 %i.09, 2 %arrayidx.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1 %18 = load i32, i32* %arrayidx.2, align 4 %arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1 %19 = load i32, i32* %arrayidx1.2, align 4 %mul.2 = mul nsw i32 %19, %18 %add.2 = add nsw i32 %mul.2, %c %arrayidx2.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1 store i32 %add.2, i32* %arrayidx2.2, align 4 %inc.2 = or i32 %i.09, 3 %arrayidx.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2 %20 = load i32, i32* %arrayidx.3, align 4 %arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2 %21 = load i32, i32* %arrayidx1.3, align 4 %mul.3 = mul nsw i32 %21, %20 %add.3 = add nsw i32 %mul.3, %c %arrayidx2.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2 store i32 %add.3, i32* %arrayidx2.3, align 4 %inc.3 = add nuw i32 %i.09, 4 %niter.nsub.3 = add i32 %niter, -4 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture %a, i8* nocapture readonly %b, i8* nocapture readonly %c, i32 %N) { ; CHECK-LABEL: test_v8i8_to_v8i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + +; %1 = icmp ule 
<8 x i32> %induction, %broadcast.splat13 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> %4 = getelementptr inbounds i8, i8* %c, i32 %index %5 = bitcast i8* %4 to <8 x i8>* %wide.masked.load14 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %5, i32 1, <8 x i1> %1, <8 x i8> undef) %6 = zext <8 x i8> %wide.masked.load14 to <8 x i16> %7 = mul nuw <8 x i16> %6, %3 %8 = getelementptr inbounds i16, i16* %a, i32 %index %9 = bitcast i16* %8 to <8 x i16>* call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %7, <8 x i16>* %9, i32 2, <8 x i1> %1) %index.next = add i32 %index, 8 %10 = icmp eq i32 %index.next, %n.vec br i1 %10, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - - - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll index 3a33e4342e01..64702cc3c315 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -1,241 +1,264 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=armv8.1m.main -mattr=+mve -S -mve-tail-predication -disable-mve-tail-predication=false %s -o - | FileCheck %s define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; CHECK-LABEL: @mat_vec_sext_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP24:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; CHECK: for.cond1.preheader.us: ; CHECK-NEXT: [[I_025_US:%.*]] = phi i32 [ [[INC10_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16*, i16** [[A:%.*]], i32 [[I_025_US]] ; CHECK-NEXT: [[TMP3:%.*]] = load i16*, 
i16** [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_025_US]] ; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) ; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <4 x i16>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) ; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14]] = add nsw <4 x i32> [[TMP13]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) ; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1 ; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; entry: %cmp24 = icmp eq i32 %N, 0 br i1 %cmp24, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader for.cond1.preheader.us.preheader: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert28 = 
insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer %tmp = add i32 %n.vec, -4 %tmp1 = lshr i32 %tmp, 2 %tmp2 = add nuw nsw i32 %tmp1, 1 br label %for.cond1.preheader.us for.cond1.preheader.us: ; preds = %middle.block, %for.cond1.preheader.us.preheader %i.025.us = phi i32 [ %inc10.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ] %arrayidx.us = getelementptr inbounds i16*, i16** %A, i32 %i.025.us %tmp3 = load i16*, i16** %arrayidx.us, align 4 %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.025.us %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4 %tmp4 = insertelement <4 x i32> , i32 %arrayidx8.promoted.us, i32 0 call void @llvm.set.loop.iterations.i32(i32 %tmp2) br label %vector.body vector.body: ; preds = %vector.body, %for.cond1.preheader.us %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp14, %vector.body ] %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index - %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + + ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp8 = bitcast i16* %tmp6 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef) %tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32> %tmp10 = getelementptr inbounds i16, i16* %B, i32 %index %tmp11 = bitcast i16* %tmp10 to <4 x i16>* %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp11, i32 2, <4 x i1> %tmp7, <4 x i16> undef) %tmp12 = sext <4 x i16> %wide.masked.load30 to <4 x i32> %tmp13 = mul nsw <4 x i32> %tmp12, %tmp9 %tmp14 = add nsw <4 x i32> %tmp13, %vec.phi %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17) store i32 %tmp18, i32* %arrayidx8.us, align 4 %inc10.us = add nuw i32 %i.025.us, 1 %exitcond27 = icmp eq i32 %inc10.us, %N br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us for.cond.cleanup: ; preds = %middle.block, %entry ret void } define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; CHECK-LABEL: @mat_vec_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP23:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; CHECK: for.cond1.preheader.us: ; CHECK-NEXT: [[I_024_US:%.*]] = phi i32 [ [[INC9_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i32 [[I_024_US]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_024_US]] ; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP12]] = add nsw <4 x i32> [[VEC_PHI]], [[TMP11]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1) ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 ; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) ; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1 ; 
CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; entry: %cmp23 = icmp eq i32 %N, 0 br i1 %cmp23, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader for.cond1.preheader.us.preheader: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer %tmp = add i32 %n.vec, -4 %tmp1 = lshr i32 %tmp, 2 %tmp2 = add nuw nsw i32 %tmp1, 1 br label %for.cond1.preheader.us for.cond1.preheader.us: ; preds = %middle.block, %for.cond1.preheader.us.preheader %i.024.us = phi i32 [ %inc9.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ] %arrayidx.us = getelementptr inbounds i32*, i32** %A, i32 %i.024.us %tmp3 = load i32*, i32** %arrayidx.us, align 4 %arrayidx7.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4 %tmp4 = insertelement <4 x i32> , i32 %arrayidx7.promoted.us, i32 0 call void @llvm.set.loop.iterations.i32(i32 %tmp2) br label %vector.body vector.body: ; preds = %vector.body, %for.cond1.preheader.us %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp12, %vector.body ] %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp13, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index - %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + + ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp8 = bitcast i32* %tmp6 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef) %tmp9 = getelementptr inbounds i32, i32* %B, i32 %index %tmp10 = bitcast i32* %tmp9 to <4 x i32>* %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp10, i32 4, <4 x i1> %tmp7, <4 x i32> undef) %tmp11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load %tmp12 = add nsw <4 x i32> %vec.phi, %tmp11 %index.next = add i32 %index, 4 %tmp13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1) %tmp14 = icmp ne i32 %tmp13, 0 br i1 %tmp14, label %vector.body, label %middle.block middle.block: ; preds = %vector.body %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp15) store i32 %tmp16, i32* %arrayidx7.us, align 4 %inc9.us = add nuw i32 %i.024.us, 1 %exitcond26 = icmp eq i32 %inc9.us, %N br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us for.cond.cleanup: ; preds = %middle.block, %entry ret void } + ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #0 ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x i16> 
@llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #0 ; Function Attrs: nounwind readnone willreturn declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #1 ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #2 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #2 +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) + attributes #0 = { argmemonly nounwind readonly willreturn } attributes #1 = { nounwind readnone willreturn } attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index eb54885304a0..dc9da0c9f764 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -1,329 +1,635 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32002, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, 
i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 ; CHECK-NEXT: [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; entry: call void @llvm.set.loop.iterations.i32(i32 8001) br label %vector.body vector.body: %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = icmp ult <4 x i32> %induction, + + ; %1 = icmp ult <4 x i32> %induction, + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) %4 = icmp ne i32 %3, 0 br i1 %4, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ret void } ; Silly test case: the loop count is constant and a multiple of the vectorisation ; factor. So, the vectoriser should not produce masked loads/stores and there's ; nothing to tail-predicate here, just checking. 
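;
; A minimal sketch of the rewrite checked in @foo above (the value names
; %mask, %elts and %elts.next are invented for illustration; the
; autogenerated CHECK lines above remain the authoritative output). The
; second operand of @llvm.get.active.lane.mask is the backedge-taken count
; (BTC), so lane i of the mask is active while (%index + i) <= BTC:
;
;   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
;
; The pass replaces it with the MVE VCTP intrinsic, whose operand is the
; number of elements still to be processed: it starts at BTC + 1 = 32003
; (hence the 32002 -> 32003 change in the phi above) and is decremented by
; the vector width each iteration:
;
;   %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)
;   %elts.next = sub i32 %elts, 4
;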
define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 2000) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 2000, [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV10]], align 4 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV1113]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]] ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[LSR_IV1416]], align 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 ; CHECK-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; entry: call void @llvm.set.loop.iterations.i32(i32 2000) br label %vector.body vector.body: %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %0 = phi i32 [ 2000, %entry ], [ %2, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %lsr.iv10, align 4 %wide.load9 = load <4 x i32>, <4 x i32>* %lsr.iv1113, align 4 %1 = add nsw <4 x i32> %wide.load9, %wide.load store <4 x i32> %1, <4 x i32>* %lsr.iv1416, align 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) %3 = icmp ne i32 %2, 0 br i1 %3, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ret void } ; Check that the icmp is a ult define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], 
[[ENTRY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 ; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; entry: call void @llvm.set.loop.iterations.i32(i32 8001) br label %vector.body vector.body: %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, ; UGT here: %1 = icmp ugt <4 x i32> %induction, + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) %4 = icmp ne i32 %3, 0 br i1 %4, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ret void } -; Check that this loop behaves as expected, i.e, that the loop increment is -; an 
increment and not a decrement. -define dso_local void @foo4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { -; CHECK-LABEL: @foo4( +define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: @foo5( ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]]) -; CHECK-NEXT: [[INDEX_NEXT]] = sub i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 ; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; entry: call void @llvm.set.loop.iterations.i32(i32 8001) br label %vector.body vector.body: %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = 
shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %1 = icmp ult <4 x i32> %induction, + +; Non-uniform constant vector here. This can't be represented with +; @llvm.get.active.lane.mask, but let's keep this test as a sanity check: + %1 = icmp ult <4 x i32> %induction, + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} -; Counting down: - %index.next = sub i32 %index, 4 +; CHECK-LABEL: @overflow_BTC_plus_1( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; +; CHECK: ret void +; +define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + +; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow: + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 %scevgep15 = getelementptr i32, 
i32* %lsr.iv14, i32 4 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) %4 = icmp ne i32 %3, 0 br i1 %4, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ret void } -define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { -; CHECK-LABEL: @foo5( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* -; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* -; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 -; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 -; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 -; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void +; CHECK-LABEL: @overflow_in_sub( +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: ret void ; +define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: call void @llvm.set.loop.iterations.i32(i32 8001) br label %vector.body vector.body: %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* 
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, -; non-uniform constant vector here: - %1 = icmp ult <4 x i32> %induction, +; Overflow in the subtraction. This should hold: +; +; ceil(ElementCount / VectorWidth) >= TripCount +; +; But we have: +; +; ceil(32000 / 4) >= 8001 +; 8000 >= 8001 +; + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 31999) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @overflow_in_rounding_tripcount( +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: ret void +; +define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + +; TC = 4294967292 +; 4294967292 <= 4294967291 (MAX - vectorwidth) +; False +; + call void @llvm.set.loop.iterations.i32(i32 4294967291) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) %4 = icmp ne i32 %3, 0 br i1 %4, label %vector.body, label %for.cond.cleanup
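;
; (A summary sketch of the overflow reasoning behind these three negative
; tests, using the descriptive names BTC, ElementCount, TripCount and
; VectorWidth from the comments above, not the pass's exact variables.)
; In i32 arithmetic, tail-predication is only safe when:
;
;   1) ElementCount = BTC + 1 does not wrap:
;        in @overflow_BTC_plus_1, BTC = 4294967295 (UINT_MAX), so
;        BTC + 1 wraps to 0;
;   2) ceil(ElementCount / VectorWidth) >= TripCount:
;        in @overflow_in_sub, ceil(32000 / 4) = 8000 < 8001;
;   3) TripCount <= UINT_MAX - VectorWidth, so that rounding the element
;      count up to a multiple of the vector width cannot wrap:
;        here, 4294967292 > 4294967291 = 4294967295 - 4.
;
; When any of these checks fails, the lane mask is not lowered to
; @llvm.arm.mve.vctp32 (in @overflow_BTC_plus_1 it is reverted to the
; equivalent icmp), which is what the CHECK-NOT lines in all three tests
; verify.
;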
for.cond.cleanup: ret void } + +; CHECK-LABEL: @IV_not_an_induction( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: ret void +; +define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* + %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + +; The first operand of the lane mask, %N, is not an induction variable: + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32002) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) + %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; CHECK-LABEL: @IV_wrong_step( +; +; CHECK-NOT: @llvm.arm.mve.vctp32 +; CHECK-NOT: @llvm.get.active.lane.mask +; +; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: ret void +; +define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +entry: + call void @llvm.set.loop.iterations.i32(i32 8001) + br label %vector.body + +vector.body: + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] + %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, 
%entry ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
+  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
+
+; %index is incremented by 3 and not 4, which is the vectorisation factor
+; that we expect here:
+  %index.next = add i32 %index, 3
+
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
+  %4 = icmp ne i32 %3, 0
+  br i1 %4, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; CHECK-LABEL: @IV_step_not_constant(
+;
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+;
+; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 32002, i32 32002, i32 32002, i32 32002>
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: ret void
+;
+define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+entry:
+  call void @llvm.set.loop.iterations.i32(i32 8001)
+  br label %vector.body
+
+vector.body:
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
+  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
+  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
+
+; %index is incremented by some runtime value, i.e. not a constant:
+  %index.next = add i32 %index, %N
+
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
+  %4 = icmp ne i32 %3, 0
+  br i1 %4, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
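+
+; A minimal reference sketch of the lane-mask computation the tests above
+; rely on (illustrative only: @lane_mask_reference and %btc are names
+; invented here, not part of these tests). With the backedge-taken-count
+; (BTC) based semantics, lane i of the result is active iff
+; %index + i <= %btc, which is the icmp ule expansion the pass reverts to
+; when tail-predication is not possible:
+;
+; define <4 x i1> @lane_mask_reference(i32 %index, i32 %btc) {
+;   %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+;   %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+;   %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
+;   %btc.insert = insertelement <4 x i32> undef, i32 %btc, i32 0
+;   %btc.splat = shufflevector <4 x i32> %btc.insert, <4 x i32> undef, <4 x i32> zeroinitializer
+;   %mask = icmp ule <4 x i32> %induction, %btc.splat
+;   ret <4 x i1> %mask
+; }
+;
+; In @IV_step_not_constant above, the BTC operand is the constant 32002, so
+; the reverted compare is against a splat of 32002.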
+; CHECK-LABEL: @outerloop_phi(
+;
+; CHECK-NOT: @llvm.arm.mve.vctp32
+; CHECK-NOT: @llvm.get.active.lane.mask
+; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0
+; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 4096, i32 4096, i32 4096, i32 4096>
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+;
+; CHECK: ret void
+;
+define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp24 = icmp eq i32 %N, 0
+  br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader
+
+vector.ph.preheader:                              ; preds = %entry
+  br label %vector.ph
+
+vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
+  %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
+  %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
+  %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
+  %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
+  call void @llvm.set.loop.iterations.i32(i32 1025)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
+  %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
+  %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = phi i32 [ 1025, %vector.ph ], [ %2, %vector.body ]
+  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
+  %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
+  %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
+
+; It's using %j.025, the induction variable from its outer loop:
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4
+  %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4
+  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4
+  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %vector.body, label %for.cond.cleanup3
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.cond.cleanup3:                                ; preds = %vector.body
+  %inc11 = add nuw i32 %j.025, 1
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 1
+  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 1
+  %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1
+  %exitcond26 = icmp eq i32 %inc11, %N
+  br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
+}
+
+
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
 declare void @llvm.set.loop.iterations.i32(i32)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index 330c6db24a74..3a9d3d117126 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -1,173 +1,184 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s

 ; CHECK-LABEL: expand_v8i16_v8i32
 ; CHECK-NOT: call i32 @llvm.arm.mve.vctp
 define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 7
   %tmp9 = lshr i32 %tmp8, 3
   %tmp10 = shl nuw i32 %tmp9, 3
   %tmp11 = add i32 %tmp10, -8
   %tmp12 = lshr i32 %tmp11, 3
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
+
+  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
+  %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
   %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
   %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
   %expand.1 = zext <8 x i16> %wide.masked.load to <8 x i32>
   %expand.2 = zext <8 x i16> %wide.masked.load2 to <8 x i32>
   %mul = mul nsw <8 x i32> %expand.2, %expand.1
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <8 x i32>*
   tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %mul, <8 x i32>* %tmp7, i32 4, <8 x i1> %tmp1)
   %index.next = add i32 %index, 8
   %tmp15 = call i32
@llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } ; CHECK-LABEL: expand_v8i16_v4i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred) ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred) define void @expand_v8i16_v4i32(i16* readonly %a, i16* readonly %b, i32* %c, i32* %d, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 %tmp9 = lshr i32 %tmp8, 3 %tmp10 = shl nuw i32 %tmp9, 3 %tmp11 = add i32 %tmp10, -8 %tmp12 = lshr i32 %tmp11, 3 %tmp13 = add nuw nsw i32 %tmp12, 1 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer call void @llvm.set.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ] %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index %tmp4 = bitcast i16* %tmp3 to <8 x i16>* %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %extract.2.low = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> %extract.2.high = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> %expand.1 = zext <4 x i16> %extract.2.low to <4 x i32> %expand.2 = zext <4 x i16> %extract.2.high to <4 x i32> %mul = mul nsw <4 x i32> %expand.2, %expand.1 %sub = mul nsw <4 x i32> %expand.1, %expand.2 %broadcast.splatinsert.store = insertelement <4 x i32> undef, i32 %store.idx, i32 0 %broadcast.splat.store = shufflevector <4 x i32> %broadcast.splatinsert.store, <4 x i32> undef, <4 x i32> zeroinitializer 
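+; The <4 x i1> store predicate computed below (%store.pred) does not come
+; from @llvm.get.active.lane.mask: it is a second, narrower induction based
+; on %store.idx, so it is expected to survive as an explicit icmp ule (see
+; the %store.pred CHECK lines above) rather than be folded into a VCTP.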
%induction.store = add <4 x i32> %broadcast.splat.store, %store.pred = icmp ule <4 x i32> %induction.store, %broadcast.splat11.store %tmp6 = getelementptr inbounds i32, i32* %c, i32 %store.idx %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %store.pred) %gep = getelementptr inbounds i32, i32* %d, i32 %store.idx %cast.gep = bitcast i32* %gep to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sub, <4 x i32>* %cast.gep, i32 4, <4 x i1> %store.pred) %store.idx.next = add i32 %store.idx, 4 %index.next = add i32 %index, 8 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } ; CHECK-LABEL: expand_v4i32_v4i64 ; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 %tmp9 = lshr i32 %tmp8, 2 %tmp10 = shl nuw i32 %tmp9, 2 %tmp11 = add i32 %tmp10, -4 %tmp12 = lshr i32 %tmp11, 2 %tmp13 = add nuw nsw i32 %tmp12, 1 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer call void @llvm.set.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %expand.1 = zext <4 x i32> %wide.masked.load to <4 x i64> %expand.2 = zext <4 x i32> %wide.masked.load2 to <4 x i64> %mul = mul nsw <4 x i64> %expand.2, %expand.1 %tmp6 = getelementptr inbounds i64, i64* %c, i32 %index %tmp7 = bitcast i64* %tmp6 to <4 x i64>* tail call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %mul, <4 x i64>* %tmp7, i32 4, <4 x i1> %tmp1) %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, 
<8 x i1>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll index c7ed9ce674dd..a93440dbcce7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -1,118 +1,325 @@ ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s +; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false \ +; RUN: -force-tail-predication -mattr=+mve %s -S -o - | FileCheck %s --check-prefix=FORCE ; CHECK-LABEL: reduction_i32 -; CHECK: phi i32 [ 0, %entry ] -; CHECK: phi <8 x i16> [ zeroinitializer, %entry ] +; CHECK: phi i32 [ 0, %vector.ph ] +; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] ; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ] +; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) { entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: %tmp = add i32 %N, -1 - %n.rnd.up = add nuw nsw i32 %tmp, 8 + %n.rnd.up = add i32 %tmp, 8 %n.vec = and i32 %n.rnd.up, -8 %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer %0 = add i32 %n.vec, -8 %1 = lshr i32 %0, 3 - %2 = add nuw nsw i32 %1, 1 + %2 = add i32 %1, 1 call void @llvm.set.loop.iterations.i32(i32 %2) br label %vector.body -vector.body: ; preds = %vector.body, %entry - %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp8, %vector.body ] - %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ] +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ] + %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + + ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 
%tmp) + %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index %tmp6 = bitcast i16* %tmp5 to <8 x i16>* %wide.masked.load3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp7 = add <8 x i16> %wide.masked.load, %vec.phi %tmp8 = add <8 x i16> %tmp7, %wide.masked.load3 - %index.next = add nuw nsw i32 %index, 8 + %index.next = add i32 %index, 8 %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1) %5 = icmp ne i32 %4, 0 br i1 %5, label %vector.body, label %middle.block middle.block: ; preds = %vector.body %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] %.lcssa3 = phi <8 x i1> [ %tmp3, %vector.body ] %.lcssa = phi <8 x i16> [ %tmp8, %vector.body ] %tmp10 = select <8 x i1> %.lcssa3, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa %rdx.shuf = shufflevector <8 x i16> %tmp10, <8 x i16> undef, <8 x i32> %bin.rdx = add <8 x i16> %rdx.shuf, %tmp10 %rdx.shuf4 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> %bin.rdx5 = add <8 x i16> %rdx.shuf4, %bin.rdx %rdx.shuf6 = shufflevector <8 x i16> %bin.rdx5, <8 x i16> undef, <8 x i32> %bin.rdx7 = add <8 x i16> %rdx.shuf6, %bin.rdx5 %tmp11 = extractelement <8 x i16> %bin.rdx7, i32 0 ret i16 %tmp11 + +for.cond.cleanup: + %res.0 = phi i16 [ 0, %entry ] + ret i16 %res.0 } ; CHECK-LABEL: reduction_i32_with_scalar -; CHECK: phi i32 [ 0, %entry ] -; CHECK: phi <8 x i16> [ zeroinitializer, %entry ] -; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ] +; CHECK: vector.body: +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: + %tmp = add i32 %N, -1 + %n.rnd.up = add nuw nsw i32 %tmp, 8 + %n.vec = and i32 %n.rnd.up, -8 + %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 + %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer + %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0 + %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = add i32 %n.vec, -8 + %1 = lshr i32 %0, 3 + %2 = add nuw nsw i32 %1, 1 + call void @llvm.set.loop.iterations.i32(i32 %2) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ] + %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ] + %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x 
i32> zeroinitializer
+  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
+
+  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
+  %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
+
+  %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
+  %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
+  %tmp6 = add <8 x i16> %tmp5, %wide.masked.load
+  %index.next = add nuw nsw i32 %index, 8
+  %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1)
+  %5 = icmp ne i32 %4, 0
+  br i1 %5, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %tmp8 = select <8 x i1> %tmp3, <8 x i16> %tmp6, <8 x i16> %vec.phi
+  %rdx.shuf = shufflevector <8 x i16> %tmp8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i16> %rdx.shuf, %tmp8
+  %rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx
+  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6
+  %tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0
+  ret i16 %tmp9
+
+for.cond.cleanup:
+  %res.0 = phi i16 [ 0, %entry ]
+  ret i16 %res.0
+}
+
+; The vector loop is not guarded by an entry check (N == 0).
+; This means we can't calculate a precise range for the backedge count in
+; @llvm.get.active.lane.mask, so we assume overflow can happen, and thus
+; we can't insert the VCTP here.
+;
+; CHECK-LABEL: @reduction_not_guarded
+;
+; CHECK-NOT: @llvm.arm.mve.vctp
+; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
+;
+; CHECK: entry:
+; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1
+; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0
+; CHECK: %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
+;
+; CHECK: vector.body:
+; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
+; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2
+; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef)
+; CHECK: ret
+;
+define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
 entry:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
   %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
   %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
   %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
   %2 = add nuw nsw i32 %1, 1
   call void @llvm.set.loop.iterations.i32(i32 %2)
   br label %vector.body

-vector.body:                                      ; preds = %vector.body, %entry
-  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp6, %vector.body ]
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
+  %vec.phi = phi <8 x i16> [
zeroinitializer, %entry], [ %tmp6, %vector.body ] %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + + ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 + %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp) + %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4 %tmp6 = add <8 x i16> %tmp5, %wide.masked.load %index.next = add nuw nsw i32 %index, 8 %4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1) %5 = icmp ne i32 %4, 0 br i1 %5, label %vector.body, label %middle.block middle.block: ; preds = %vector.body %tmp8 = select <8 x i1> %tmp3, <8 x i16> %tmp6, <8 x i16> %vec.phi %rdx.shuf = shufflevector <8 x i16> %tmp8, <8 x i16> undef, <8 x i32> %bin.rdx = add <8 x i16> %rdx.shuf, %tmp8 %rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> %bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> %bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6 %tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0 ret i16 %tmp9 } +; Without forcing tail-predication, we bail because overflow analysis says: +; +; overflow possible in: {(-1 + (sext i16 %Size to i32)),+,-1}<%for.body> +; +; CHECK-LABEL: @Correlation +; +; CHECK: entry: +; CHECK: for.body.lr.ph: ; preds = %entry +; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph +; CHECK: vector.ph: ; preds = %for.body +; CHECK: %trip.count.minus.1 = add i32 %8, -1 +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7) +; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 +; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: br label %vector.body +; CHECK: vector.body: +; CHECK-NOT: @llvm.arm.mve.vctp +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc +; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} +; +; +; FORCE-LABEL: @Correlation +; FORCE: vector.ph: ; preds = %for.body +; FORCE: %trip.count.minus.1 = add i32 %{{.*}}, -1 +; FORCE: call void @llvm.set.loop.iterations.i32(i32 %{{.*}}) +; FORCE: br label %vector.body +; FORCE: vector.body: ; preds = %vector.body, %vector.ph +; FORCE: %[[VCTP:.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 %{{.*}}) +; FORCE: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[VCTP]]{{.*}} +; +define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +entry: + %conv = sext i16 %N to i32 + %cmp36 = icmp sgt i16 %N, 0 + br i1 %cmp36, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: + %conv2 = sext i16 %Size to i32 + %conv1032 = zext i16 %Scale to i32 + %0 = add i32 %conv2, 3 + br label %for.body + +for.body: + %lsr.iv51 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ] + %lsr.iv46 = phi i16* [ %scevgep47, %for.end ], [ %Input, %for.body.lr.ph ] + %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, 
%for.end ] + %1 = mul nsw i32 %i.037, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv2, %i.037 + %cmp433 = icmp slt i32 %i.037, %conv2 + br i1 %cmp433, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv48 = phi i16* [ %scevgep49, %vector.body ], [ %lsr.iv46, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %16, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ] + %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>* + %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load42 to <4 x i32> + %12 = mul nsw <4 x i32> %11, %10 + %13 = insertelement <4 x i32> undef, i32 %conv1032, i32 0 + %14 = shufflevector <4 x i32> %13, <4 x i32> undef, <4 x i32> zeroinitializer + %15 = ashr <4 x i32> %12, %14 + %16 = add <4 x i32> %15, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 4 + %17 = call i32 @llvm.loop.decrement.reg.i32(i32 %9, i32 1) + %18 = icmp ne i32 %17, 0 + br i1 %18, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %19 = select <4 x i1> %active.lane.mask, <4 x i32> %16, <4 x i32> %vec.phi + %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %20, %middle.block ] + %21 = lshr i32 %Sum.0.lcssa, 16 + %conv13 = trunc i32 %21 to i16 + %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037 + store i16 %conv13, i16* %arrayidx14, align 2 + %inc16 = add nuw nsw i32 %i.037, 1 + %scevgep47 = getelementptr i16, i16* %lsr.iv46, i32 1 + %lsr.iv.next = add i32 %lsr.iv51, -1 + %exitcond39 = icmp eq i32 %inc16, %conv + br i1 %exitcond39, label %for.end17, label %for.body + +for.end17: ; preds = %for.end, %entry + ret void +} + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index e2f68d32c2f5..d2e9c584c257 100644 --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -1,430 +1,458 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; CHECK-LABEL: mul_reduce_add: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: lsr.w r3, r12, #2 -; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = getelementptr inbounds i32, i32* %b, i32 %index %4 = bitcast i32* %3 to <4 x i32>* %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef) %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load %6 = add nsw <4 x i32> %5, %vec.phi %index.next = add i32 %index, 4 %7 = icmp eq i32 %index.next, %n.vec br i1 %7, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block 
] ret i32 %res.0.lcssa } define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) { ; CHECK-LABEL: mul_reduce_add_const: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r1, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: lsrs r1, r1, #2 -; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi %index.next = add i32 %index, 4 %4 = icmp eq i32 %index.next, %n.vec br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ] ret i32 %res.0.lcssa } define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) { ; CHECK-LABEL: add_reduce_add_const: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r1, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: lsrs r1, r1, #2 -; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 
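+; (The vctp.32 just above builds the <4 x i1> lane mask from the count of
+; elements still to be processed in r2, which the loop then decrements by 4
+; each iteration; it is the hardware counterpart of the
+; @llvm.get.active.lane.mask call in the IR below.)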
; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi %index.next = add i32 %index, 4 %4 = icmp eq i32 %index.next, %n.vec br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ] ret i32 %res.0.lcssa } define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) { ; CHECK-LABEL: vector_mul_const: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert 
= insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11 %4 = getelementptr inbounds i32, i32* %a, i32 %index %5 = bitcast i32* %4 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %6 = icmp eq i32 %index.next, %n.vec br i1 %6, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) { ; CHECK-LABEL: vector_add_const: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp eq i32 %N, 0 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11 %4 = getelementptr inbounds i32, i32* %a, i32 %index %5 = bitcast i32* %4 to <4 x i32>* call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %6 = icmp eq i32 %index.next, %n.vec br i1 %6, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } define dso_local 
arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) { ; CHECK-LABEL: vector_mul_vector_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 ; CHECK-NEXT: vstrb.8 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 15 %n.vec = and i32 %n.rnd.up, -16 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer %induction = add <16 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + + ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef) %3 = getelementptr inbounds i8, i8* %c, i32 %index %4 = bitcast i8* %3 to <16 x i8>* %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef) %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load %6 = getelementptr inbounds i8, i8* %a, i32 %index %7 = bitcast i8* %6 to <16 x i8>* call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1) %index.next = add i32 %index, 16 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } ; Function Attrs: nofree norecurse nounwind define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: vector_mul_vector_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 
7 %n.vec = and i32 %n.rnd.up, -8 %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + + ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef) %3 = getelementptr inbounds i16, i16* %c, i32 %index %4 = bitcast i16* %3 to <8 x i16>* %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef) %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load %6 = getelementptr inbounds i16, i16* %a, i32 %index %7 = bitcast i16* %6 to <8 x i16>* call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1) %index.next = add i32 %index, 8 %8 = icmp eq i32 %index.next, %n.vec br i1 %8, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body, %entry ret void } declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index a5b4c61a82ed..1d766f378fc5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -1,81 +1,84 @@ ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s ; CHECK-LABEL: vec_mul_reduce_add ; CHECK: vector.ph: ; CHECK: call void @llvm.set.loop.iterations.i32 -; CHECK: [[UF:%[^ ]+]] = shl i32 %{{.*}}, 2 -; CHECK: [[REMAT_ITER:%[^ ]+]] = sub i32 %N, [[UF]] ; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: phi i32 [ 0, %vector.ph ] ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 ; CHECK: call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]] ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], ; CHECK: middle.block: -; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER]]) +; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]]) ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 %0 = add i32 %N, 3 %1 = lshr i32 %0, 2 %2 = shl nuw i32 %1, 2 %3 = add i32 %2, -4 %4 = lshr i32 %3, 2 %5 = add nuw nsw i32 %4, 1 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph - + vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer call void @llvm.set.loop.iterations.i32(i32 %5) br label %vector.body - + vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ] %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ] %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 + + ; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef) %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef) %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load %9 = add nsw <4 x i32> %8, %vec.phi %index.next = add i32 %index, 4 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4 %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %11 = icmp ne i32 %10, 0 br i1 %11, label %vector.body, label %middle.block - + middle.block: ; preds = %vector.body - %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 +; TODO: check that the intrinsic is also emitted here by the loop vectoriser +; %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13) br label %for.cond.cleanup - + for.cond.cleanup: ; preds = %middle.block, %entry %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ] ret i32 %res.0.lcssa } - + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) declare 
void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll index bd1141d4b1af..0ba224415b67 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -1,712 +1,773 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -disable-mve-tail-predication=false %s -o - | FileCheck %s define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: fmas1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 ; CHECK-NEXT: vstrw.32 q1, [r2], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 %trip.count.minus.1 = add i32 %n, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + + ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index %4 = bitcast float* %3 to <4 x float>* %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef) %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14) %6 = getelementptr inbounds float, float* %z, i32 %index %7 = bitcast float* %6 to <4 x float>* call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1) %index.next = add i32 %index, 4 %8 = icmp eq i32 
   %8 = icmp eq i32 %index.next, %n.vec
   br i1 %8, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fmas2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
 ; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB1_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
   %6 = fadd fast <4 x float> %5, %broadcast.splat14
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fma1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
 ; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB2_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
   %6 = getelementptr inbounds float, float* %z, i32 %index
   %7 = bitcast float* %6 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %8 = icmp eq i32 %index.next, %n.vec
   br i1 %8, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fma2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
 ; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
   %4 = getelementptr inbounds float, float* %y, i32 %index
   %5 = bitcast float* %4 to <4 x float>*
   %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
   %6 = fadd fast <4 x float> %3, %wide.masked.load14
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fmss1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    dlstp.32 lr, r3
-; CHECK-NEXT:    eor r12, r12, #-2147483648
+; CHECK-NEXT:    eor r12, r4, #-2147483648
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
 ; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB4_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %fneg = fneg fast float %a
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
   %6 = getelementptr inbounds float, float* %z, i32 %index
   %7 = bitcast float* %6 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %8 = icmp eq i32 %index.next, %n.vec
   br i1 %8, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fmss2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    vdup.32 q0, r12
-; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vdup.32 q0, r4
 ; CHECK-NEXT:    vneg.f32 q0, q0
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q3, q2, q1
 ; CHECK-NEXT:    vstrw.32 q3, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB5_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
   %6 = fsub fast <4 x float> %5, %broadcast.splat14
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fmss3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vdup.32 q0, r12
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
 ; CHECK-NEXT:    vfms.f32 q3, q2, q1
 ; CHECK-NEXT:    vstrw.32 q3, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB6_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = fneg fast <4 x float> %wide.masked.load12
   %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %5, <4 x float> %broadcast.splat14)
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fmss4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vdup.32 q0, r12
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfms.f32 q3, q2, q1
 ; CHECK-NEXT:    vstrw.32 q3, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
   %6 = fsub fast <4 x float> %broadcast.splat14, %5
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fms1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    dlstp.32 lr, r3
-; CHECK-NEXT:    eor r12, r12, #-2147483648
+; CHECK-NEXT:    eor r12, r4, #-2147483648
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
 ; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %fneg = fneg fast float %a
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
   %6 = getelementptr inbounds float, float* %z, i32 %index
   %7 = bitcast float* %6 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %8 = icmp eq i32 %index.next, %n.vec
   br i1 %8, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fms2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vdup.32 q0, r12
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfms.f32 q2, q1, q0
 ; CHECK-NEXT:    vstrw.32 q2, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
   %6 = fsub fast <4 x float> %wide.masked.load12, %5
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fms3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    vfma.f32 q0, q1, r12
-; CHECK-NEXT:    vstrw.32 q0, [r2], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
+; CHECK-NEXT:    adds r4, #4
+; CHECK-NEXT:    vneg.f32 q1, q1
+; CHECK-NEXT:    vfma.f32 q1, q0, r12
+; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB10_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
   %4 = bitcast float* %3 to <4 x float>*
   %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
   %5 = fneg fast <4 x float> %wide.masked.load12
   %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %5)
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: fms4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB11_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    vfma.f32 q0, q1, r12
-; CHECK-NEXT:    vstrw.32 q0, [r2], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
+; CHECK-NEXT:    adds r4, #4
+; CHECK-NEXT:    vneg.f32 q1, q1
+; CHECK-NEXT:    vfma.f32 q1, q0, r12
+; CHECK-NEXT:    vstrw.32 q1, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB11_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
   %trip.count.minus.1 = add i32 %n, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
   %4 = getelementptr inbounds float, float* %y, i32 %index
   %5 = bitcast float* %4 to <4 x float>*
   %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
   %6 = fsub fast <4 x float> %3, %wide.masked.load14
   %7 = getelementptr inbounds float, float* %z, i32 %index
   %8 = bitcast float* %7 to <4 x float>*
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
   %index.next = add i32 %index, 4
   %9 = icmp eq i32 %index.next, %n.vec
   br i1 %9, label %for.cond.cleanup, label %vector.body
 
 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }
 
 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
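
; ---------------------------------------------------------------------------
; Illustrative sketch, not part of the patch above: the commented-out
; `icmp ule` lines in these tests and the @llvm.get.active.lane.mask calls
; that replace them compute the same <4 x i1> predicate. With %btc being the
; backedge-taken count (%trip.count.minus.1 in the tests), lane i is active
; iff %index + i <= %btc. The function name below is made up for
; illustration only.

define <4 x i1> @active_lane_mask_equiv(i32 %index, i32 %btc) {
entry:
  ; Splat the scalar induction value and the backedge-taken count.
  %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %btc.splatinsert = insertelement <4 x i32> undef, i32 %btc, i32 0
  %btc.splat = shufflevector <4 x i32> %btc.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Per-lane induction: <%index, %index+1, %index+2, %index+3>.
  %induction = add <4 x i32> %index.splat, <i32 0, i32 1, i32 2, i32 3>
  ; Unsigned compare against the BTC yields the active/inactive lanes, the
  ; same value as:
  ;   call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
  %mask = icmp ule <4 x i32> %induction, %btc.splat
  ret <4 x i1> %mask
}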