Index: llvm/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/lib/Target/ARM/MVETailPredication.cpp +++ llvm/lib/Target/ARM/MVETailPredication.cpp @@ -31,15 +31,14 @@ /// blocks of instructions operating on different vector types. /// /// This pass: -/// 1) Pattern matches the scalar iteration count produced by the vectoriser. -/// The scalar loop iteration count represents the number of elements to be -/// processed. -/// TODO: this could be emitted using an intrinsic, similar to the hardware -/// loop intrinsics, so that we don't need to pattern match this here. -/// 2) Inserts the VCTP intrinsic to represent the effect of -/// tail predication. This will be picked up by the ARM Low-overhead loop -/// pass, which performs the final transformation to a DLSTP or WLSTP -/// tail-predicated loop. +/// 1) Checks if the predicates of the masked load/store instructions are +/// generated by intrinsic @llvm.get.active.lane.mask(). This intrinsic consumes +/// the Backedge Taken Count (BTC) of the scalar loop as its second argument, +/// which we extract to set up the number of elements processed by the loop. +/// 2) Intrinsic @llvm.get.active.lane.mask() is then replaced by the MVE target +/// specific VCTP intrinsic to represent the effect of tail predication. +/// This will be picked up by the ARM Low-overhead loop pass, which performs +/// the final transformation to a DLSTP or WLSTP tail-predicated loop. #include "ARM.h" #include "ARMSubtarget.h" @@ -70,28 +69,6 @@ cl::desc("Disable MVE Tail Predication")); namespace { -// Bookkeeping for pattern matching the loop trip count and the number of -// elements processed by the loop. -struct TripCountPattern { - // An icmp instruction that calculates a predicate of active/inactive lanes - // used by the masked loads/stores. - Instruction *Predicate = nullptr; - - // The add instruction that increments the IV. 
- Value *TripCount = nullptr; - - // The number of elements processed by the vector loop. - Value *NumElements = nullptr; - - // Other instructions in the icmp chain that calculate the predicate. - FixedVectorType *VecTy = nullptr; - Instruction *Shuffle = nullptr; - Instruction *Induction = nullptr; - - TripCountPattern(Instruction *P, Value *TC, FixedVectorType *VT) - : Predicate(P), TripCount(TC), VecTy(VT){}; -}; - class MVETailPredication : public LoopPass { SmallVector MaskedInsts; Loop *L = nullptr; @@ -129,17 +106,9 @@ /// load/stores. bool IsPredicatedVectorLoop(); - /// Compute a value for the total number of elements that the predicated - /// loop will process if it is a runtime value. - bool ComputeRuntimeElements(TripCountPattern &TCP); - - /// Return whether this is the icmp that generates an i1 vector, based - /// upon a loop counter and a limit that is defined outside the loop, - /// that generates the active/inactive lanes required for tail-predication. - bool isTailPredicate(TripCountPattern &TCP); - /// Insert the intrinsic to represent the effect of tail predication. - void InsertVCTPIntrinsic(TripCountPattern &TCP, + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy, DenseMap &NewPredicates); /// Rematerialize the iteration count in exit blocks, which enables @@ -262,76 +231,6 @@ return false; } -// Pattern match predicates/masks and determine if they use the loop induction -// variable to control the number of elements processed by the loop. If so, -// the loop is a candidate for tail-predication. -bool MVETailPredication::isTailPredicate(TripCountPattern &TCP) { - using namespace PatternMatch; - - // Pattern match the loop body and find the add with takes the index iv - // and adds a constant vector to it: - // - // vector.body: - // .. 
- // %index = phi i32 - // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, - // <4 x i32> undef, - // <4 x i32> zeroinitializer - // %induction = [add|or] <4 x i32> %broadcast.splat, - // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11 - // - // Please note that the 'or' is equivalent to the 'and' here, this relies on - // BroadcastSplat being the IV which we know is a phi with 0 start and Lanes - // increment, which is all being checked below. - Instruction *BroadcastSplat = nullptr; - Constant *Const = nullptr; - if (!match(TCP.Induction, - m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))) && - !match(TCP.Induction, - m_Or(m_Instruction(BroadcastSplat), m_Constant(Const)))) - return false; - - // Check that we're adding <0, 1, 2, 3... - if (auto *CDS = dyn_cast(Const)) { - for (unsigned i = 0; i < CDS->getNumElements(); ++i) { - if (CDS->getElementAsInteger(i) != i) - return false; - } - } else - return false; - - Instruction *Insert = nullptr; - // The shuffle which broadcasts the index iv into a vector. - if (!match(BroadcastSplat, - m_ShuffleVector(m_Instruction(Insert), m_Undef(), m_ZeroMask()))) - return false; - - // The insert element which initialises a vector with the index iv. - Instruction *IV = nullptr; - if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero()))) - return false; - - // The index iv. - auto *Phi = dyn_cast(IV); - if (!Phi) - return false; - - // TODO: Don't think we need to check the entry value. 
- Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); - if (!match(OnEntry, m_Zero())) - return false; - - Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); - unsigned Lanes = cast(Insert->getType())->getNumElements(); - - Instruction *LHS = nullptr; - if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) - return false; - - return LHS == Phi; -} - static FixedVectorType *getVectorType(IntrinsicInst *I) { unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; auto *PtrTy = cast(I->getOperand(TypeOp)->getType()); @@ -368,177 +267,6 @@ return !MaskedInsts.empty(); } -// Pattern match the predicate, which is an icmp with a constant vector of this -// form: -// -// icmp ult <4 x i32> %induction, -// -// and return the constant, i.e. 32002 in this example. This is assumed to be -// the scalar loop iteration count: the number of loop elements by the -// the vector loop. Further checks are performed in function isTailPredicate(), -// to verify 'induction' behaves as an induction variable. 
-// -static bool ComputeConstElements(TripCountPattern &TCP) { - if (!dyn_cast(TCP.TripCount)) - return false; - - ConstantInt *VF = ConstantInt::get( - cast(TCP.TripCount->getType()), TCP.VecTy->getNumElements()); - using namespace PatternMatch; - CmpInst::Predicate CC; - - if (!match(TCP.Predicate, m_ICmp(CC, m_Instruction(TCP.Induction), - m_AnyIntegralConstant())) || - CC != ICmpInst::ICMP_ULT) - return false; - - LLVM_DEBUG(dbgs() << "ARM TP: icmp with constants: "; TCP.Predicate->dump();); - Value *ConstVec = TCP.Predicate->getOperand(1); - - auto *CDS = dyn_cast(ConstVec); - if (!CDS || CDS->getNumElements() != VF->getSExtValue()) - return false; - - if ((TCP.NumElements = CDS->getSplatValue())) { - assert(dyn_cast(TCP.NumElements)->getSExtValue() % - VF->getSExtValue() != - 0 && - "tail-predication: trip count should not be a multiple of the VF"); - LLVM_DEBUG(dbgs() << "ARM TP: Found const elem count: " << *TCP.NumElements - << "\n"); - return true; - } - return false; -} - -// Pattern match the loop iteration count setup: -// -// %trip.count.minus.1 = add i32 %N, -1 -// %broadcast.splatinsert10 = insertelement <4 x i32> undef, -// i32 %trip.count.minus.1, i32 0 -// %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, -// <4 x i32> undef, -// <4 x i32> zeroinitializer -// .. -// vector.body: -// .. -// -static bool MatchElemCountLoopSetup(Loop *L, Instruction *Shuffle, - Value *NumElements) { - using namespace PatternMatch; - Instruction *Insert = nullptr; - - if (!match(Shuffle, - m_ShuffleVector(m_Instruction(Insert), m_Undef(), m_ZeroMask()))) - return false; - - // Insert the limit into a vector. - Instruction *BECount = nullptr; - if (!match(Insert, - m_InsertElement(m_Undef(), m_Instruction(BECount), m_Zero()))) - return false; - - // The limit calculation, backedge count. 
- Value *TripCount = nullptr; - if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) - return false; - - if (TripCount != NumElements || !L->isLoopInvariant(BECount)) - return false; - - return true; -} - -bool MVETailPredication::ComputeRuntimeElements(TripCountPattern &TCP) { - using namespace PatternMatch; - const SCEV *TripCountSE = SE->getSCEV(TCP.TripCount); - ConstantInt *VF = ConstantInt::get( - cast(TCP.TripCount->getType()), TCP.VecTy->getNumElements()); - - if (VF->equalsInt(1)) - return false; - - CmpInst::Predicate Pred; - if (!match(TCP.Predicate, m_ICmp(Pred, m_Instruction(TCP.Induction), - m_Instruction(TCP.Shuffle))) || - Pred != ICmpInst::ICMP_ULE) - return false; - - LLVM_DEBUG(dbgs() << "Computing number of elements for vector trip count: "; - TCP.TripCount->dump()); - - // Otherwise, continue and try to pattern match the vector iteration - // count expression - auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr * { - if (auto *Const = dyn_cast(S->getOperand(0))) { - if (Const->getAPInt() != -VF->getValue()) - return nullptr; - } else - return nullptr; - return dyn_cast(S->getOperand(1)); - }; - - auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr * { - if (auto *Const = dyn_cast(S->getOperand(0))) { - if (Const->getValue() != VF) - return nullptr; - } else - return nullptr; - return dyn_cast(S->getOperand(1)); - }; - - auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV * { - if (auto *Const = dyn_cast(S->getRHS())) { - if (Const->getValue() != VF) - return nullptr; - } else - return nullptr; - - if (auto *RoundUp = dyn_cast(S->getLHS())) { - if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { - if (Const->getAPInt() != (VF->getValue() - 1)) - return nullptr; - } else - return nullptr; - - return RoundUp->getOperand(1); - } - return nullptr; - }; - - // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to - // determine the numbers of elements instead? 
Looks like this is what is used - // for delinearization, but I'm not sure if it can be applied to the - // vectorized form - at least not without a bit more work than I feel - // comfortable with. - - // Search for Elems in the following SCEV: - // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF)) - const SCEV *Elems = nullptr; - if (auto *TC = dyn_cast(TripCountSE)) - if (auto *Div = dyn_cast(TC->getOperand(1))) - if (auto *Add = dyn_cast(Div->getLHS())) - if (auto *Mul = VisitAdd(Add)) - if (auto *Div = VisitMul(Mul)) - if (auto *Res = VisitDiv(Div)) - Elems = Res; - - if (!Elems) - return false; - - Instruction *InsertPt = L->getLoopPreheader()->getTerminator(); - if (!isSafeToExpandAt(Elems, InsertPt, *SE)) - return false; - - auto DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "elements"); - TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); - - if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) - return false; - - return true; -} // Look through the exit block to see whether there's a duplicate predicate // instruction. 
This can happen when we need to perform a select on values @@ -587,10 +315,10 @@ if (auto *OpI = dyn_cast(U)) MaybeDead.insert(OpI); - I->dropAllReferences(); Dead.insert(I); } + for (auto *I : Dead) { LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump()); I->eraseFromParent(); @@ -602,23 +330,36 @@ return ClonedVCTPInExitBlock; } -void MVETailPredication::InsertVCTPIntrinsic(TripCountPattern &TCP, +void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, + Value *TripCount, FixedVectorType *VecTy, DenseMap &NewPredicates) { - IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); + + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); + // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, + // is one less than the trip count. So, here we create: + // %num.elements = %BTC + 1 + // and dump that in the preheader. + Value *ScalarBTC = ActiveLaneMask->getOperand(1); + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + Value *NumElements = Builder.CreateAdd(ScalarBTC, + ConstantInt::get(ScalarBTC->getType(), 1), "num.elements"); + // Insert a phi to count the number of elements processed by the loop. + Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() ); PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(TCP.NumElements, L->getLoopPreheader()); + Processed->addIncoming(NumElements, L->getLoopPreheader()); - // Insert the intrinsic to represent the effect of tail predication. - Builder.SetInsertPoint(cast(TCP.Predicate)); + // Replace @llvm.get.active.lane.mask() with the ARM specific VCTP intrinsic, and thus + // represent the effect of tail predication. 
+ Builder.SetInsertPoint(ActiveLaneMask); ConstantInt *Factor = - ConstantInt::get(cast(Ty), TCP.VecTy->getNumElements()); + ConstantInt::get(cast(Ty), VecTy->getNumElements()); Intrinsic::ID VCTPID; - switch (TCP.VecTy->getNumElements()) { + switch (VecTy->getNumElements()) { default: llvm_unreachable("unexpected number of lanes"); case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; @@ -632,9 +373,9 @@ // purposes, but takes a v4i1 instead of a v2i1. } Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); - Value *TailPredicate = Builder.CreateCall(VCTP, Processed); - TCP.Predicate->replaceAllUsesWith(TailPredicate); - NewPredicates[TCP.Predicate] = cast(TailPredicate); + Value *VCTPCall = Builder.CreateCall(VCTP, Processed); + ActiveLaneMask->replaceAllUsesWith(VCTPCall); + NewPredicates[ActiveLaneMask] = cast(VCTPCall); // Add the incoming value to the new phi. // TODO: This add likely already exists in the loop. @@ -642,7 +383,7 @@ Processed->addIncoming(Remaining, L->getLoopLatch()); LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: " << *Processed << "\n" - << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n"); + << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } bool MVETailPredication::TryConvert(Value *TripCount) { @@ -653,51 +394,28 @@ LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - // Walk through the masked intrinsics and try to find whether the predicate - // operand is generated from an induction variable. SetVector Predicates; DenseMap NewPredicates; -#ifndef NDEBUG - // For debugging purposes, use this to indicate we have been able to - // pattern match the scalar loop trip count. - bool FoundScalarTC = false; -#endif - + // Walk through the masked intrinsics and try to find whether the predicate + // operand is generated by intrinsic @llvm.get.active.lane.mask(). for (auto *I : MaskedInsts) { - Intrinsic::ID ID = I->getIntrinsicID(); - // First, find the icmp used by this masked load/store. 
- unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3; + unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3; auto *Predicate = dyn_cast(I->getArgOperand(PredOp)); if (!Predicate || Predicates.count(Predicate)) continue; - // Step 1: using this icmp, now calculate the number of elements - // processed by this loop. - TripCountPattern TCP(Predicate, TripCount, getVectorType(I)); - if (!(ComputeConstElements(TCP) || ComputeRuntimeElements(TCP))) + auto *ActiveLaneMask = dyn_cast(Predicate); + if (!ActiveLaneMask || + ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; - LLVM_DEBUG(FoundScalarTC = true); + LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " + << *ActiveLaneMask << "\n"); - if (!isTailPredicate(TCP)) { - LLVM_DEBUG(dbgs() << "ARM TP: Not an icmp that generates tail predicate: " - << *Predicate << "\n"); - continue; - } - - LLVM_DEBUG(dbgs() << "ARM TP: Found icmp generating tail predicate: " - << *Predicate << "\n"); Predicates.insert(Predicate); - - // Step 2: emit the VCTP intrinsic representing the effect of TP. - InsertVCTPIntrinsic(TCP, NewPredicates); - } - - if (!NewPredicates.size()) { - LLVM_DEBUG(if (!FoundScalarTC) - dbgs() << "ARM TP: Can't determine loop itertion count\n"); - return false; + InsertVCTPIntrinsic(ActiveLaneMask, TripCount, getVectorType(I), + NewPredicates); } // Now clean up. 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -1,9 +1,10 @@ ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s ; CHECK-LABEL: mul_v16i8 +; CHECK: %num.elements = add i32 %trip.count.minus.1, 1 ; CHECK: vector.body: ; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] +; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %num.elements, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) @@ -34,16 +35,19 @@ %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer %induction = or <16 x i32> %broadcast.splat, %tmp = getelementptr inbounds i8, i8* %a, i32 %index - %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i8* %tmp to <16 x i8>* - %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef) + %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index %tmp4 = bitcast i8* %tmp3 to <16 x i8>* - %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, 
<16 x i1> %tmp1, <16 x i8> undef) + %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index %tmp7 = bitcast i8* %tmp6 to <16 x i8>* - tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1) + tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask) %index.next = add i32 %index, 16 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -54,9 +58,10 @@ } ; CHECK-LABEL: mul_v8i16 +; CHECK: %num.elements = add i32 %trip.count.minus.1, 1 ; CHECK: vector.body: ; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] +; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %num.elements, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) @@ -87,16 +92,19 @@ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %tmp2 = bitcast i16* %tmp to <8 x i16>* - %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) + %wide.masked.load = tail call <8 x i16> 
@llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index %tmp4 = bitcast i16* %tmp3 to <8 x i16>* - %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef) + %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index %tmp7 = bitcast i16* %tmp6 to <8 x i16>* - tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %tmp1) + tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask) %index.next = add i32 %index, 8 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -107,8 +115,9 @@ } ; CHECK-LABEL: mul_v4i32 +; CHECK: %num.elements = add i32 %trip.count.minus.1, 1 ; CHECK: vector.body: -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] +; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %num.elements, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) @@ -139,16 +148,17 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - 
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* - %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* - tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -159,9 +169,10 @@ } ; CHECK-LABEL: split_vector +; CHECK: %num.elements = add i32 %trip.count.minus.1, 1 ; CHECK: vector.body: ; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] +; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %num.elements, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) @@ -192,14 +203,15 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x 
i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2> %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 1, i32 3> %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* - %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 0, i32 2> %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 1, i32 3> %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low @@ -207,7 +219,7 @@ %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* - tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 %tmp15 = call 
i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -219,7 +231,8 @@ ; One of the loads now uses ult predicate. ; CHECK-LABEL: mismatch_load_pred -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] +; CHECK: %num.elements = add i32 %trip.count.minus.1, 1 +; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %num.elements, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) @@ -250,17 +263,20 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef) %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* - tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x 
i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -272,8 +288,10 @@ ; The store now uses ult predicate. ; CHECK-LABEL: mismatch_store_pred +; CHECK: %num.elements = add i32 %trip.count.minus.1, 1 +; CHECK: vector.body: ; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] +; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %num.elements, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) @@ -304,13 +322,16 @@ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + +; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* - %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index %tmp4 = bitcast i32* %tmp3 to <4 x i32>* - %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* 
%tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index %tmp7 = bitcast i32* %tmp6 to <4 x i32>* @@ -334,4 +355,6 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)