diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -86,8 +86,6 @@
   TargetTransformInfo *TTI = nullptr;
   TargetLibraryInfo *TLI = nullptr;
   bool ClonedVCTPInExitBlock = false;
-  IntrinsicInst *ActiveLaneMask = nullptr;
-  FixedVectorType *VecTy = nullptr;
 
 public:
   static char ID;
@@ -119,7 +117,8 @@
   /// intrinsic: check if the first is a loop induction variable, and for the
   /// the second check that no overflow can occur in the expression that use
   /// this backedge-taken count.
-  bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);
+  bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+                        FixedVectorType *VecTy);
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
@@ -130,10 +129,6 @@
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
-
-  /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs
-  /// to be lowered to an icmp.
-  void RevertActiveLaneMask();
 };
 
 } // end namespace
@@ -167,83 +162,6 @@
                        DeadInsts);
 }
 
-void MVETailPredication::RevertActiveLaneMask() {
-  if (!ActiveLaneMask)
-    return;
-
-  int VectorWidth = VecTy->getElementCount().Min;
-  IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());
-
-  // 1. Create the vector induction step. This %induction will be the LHS of
-  // the icmp:
-  //
-  // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0
-  // %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
-  //
-  Value *Index = ActiveLaneMask->getOperand(0);
-  Value *SplatIndex =
-      Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");
-
-  SmallVector<Constant *, 8> Indices;
-  for (int i = 0; i < VectorWidth; ++i)
-    Indices.push_back(ConstantInt::get(Index->getType(), i));
-
-  Constant *CV = ConstantVector::get(Indices);
-  Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");
-
-  LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
-             dbgs() << "ARM TP: New Induction: " << *Induction << "\n");
-
-  // 2. In the Preheader, first look if the splat BTC already exists. Find this
-  // %splat, which will be the RHS of the icmp:
-  //
-  // %TC.minus.1 = add i32 %N, -1
-  // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
-  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0
-  //
-  auto *Preheader = L->getLoopPreheader();
-  auto *BTC = ActiveLaneMask->getOperand(1);
-  Value *SplatBTC = nullptr;
-
-  if (auto *C = dyn_cast<ConstantInt>(BTC)) {
-    Builder.SetInsertPoint(Preheader->getTerminator());
-    SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
-    LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
-  } else {
-    Instruction *InsertElem;
-    for (auto &V : *Preheader) {
-      InsertElem = dyn_cast<InsertElementInst>(&V);
-      if (!InsertElem)
-        continue;
-      ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
-      if (!CI)
-        continue;
-      if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
-        continue;
-      if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
-        break;
-    }
-  }
-  // Or create the splat BTC if it doesn't exist.
-  if (!SplatBTC) {
-    Builder.SetInsertPoint(Preheader->getTerminator());
-    Value *Undef =
-        UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
-    Value *Insert = Builder.CreateInsertElement(Undef,
-        BTC, Builder.getInt32(0), "insert.btc");
-    Value *Zero = ConstantInt::get(Insert->getType(), 0);
-    SplatBTC = Builder.CreateShuffleVector(Insert, Undef, Zero, "splat.btc");
-    LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
-  }
-
-  Builder.SetInsertPoint(ActiveLaneMask);
-  Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
-  LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
-  ActiveLaneMask->replaceAllUsesWith(ICmp);
-  ActiveLaneMask->eraseFromParent();
-}
-
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -261,7 +179,6 @@
   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
   DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
-  ActiveLaneMask = nullptr;
 
   // The MVE and LOB extensions are combined to enable tail-predication, but
   // there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -318,15 +235,14 @@
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
                     << *Decrement << "\n");
 
-  if (TryConvert(Setup->getArgOperand(0))) {
-    if (ClonedVCTPInExitBlock)
-      RematerializeIterCount();
-    return true;
-  } else
-    RevertActiveLaneMask();
+  if (!TryConvert(Setup->getArgOperand(0))) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
+    return false;
+  }
 
-  LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
-  return false;
+  if (ClonedVCTPInExitBlock)
+    RematerializeIterCount();
+  return true;
 }
 
 static FixedVectorType *getVectorType(IntrinsicInst *I) {
@@ -341,10 +257,27 @@
   // Check that the loop contains at least one masked load/store intrinsic.
   // We only support 'normal' vector instructions - other than masked
   // load/stores.
+  bool ActiveLaneMask = false;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
+      auto *Int = dyn_cast<IntrinsicInst>(&I);
+      if (!Int)
+        continue;
+
+      switch (Int->getIntrinsicID()) {
+      case Intrinsic::get_active_lane_mask:
+        ActiveLaneMask = true;
+        LLVM_FALLTHROUGH;
+      case Intrinsic::fma:
+      case Intrinsic::sadd_sat:
+      case Intrinsic::uadd_sat:
+        continue;
+      default:
+        break;
+      }
+
       if (IsMasked(&I)) {
-        FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+        auto *VecTy = getVectorType(Int);
         unsigned Lanes = VecTy->getNumElements();
         unsigned ElementWidth = VecTy->getScalarSizeInBits();
         // MVE vectors are 128-bit, but don't support 128 x i1.
@@ -353,23 +286,20 @@
         if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
           return false;
         MaskedInsts.push_back(cast<IntrinsicInst>(&I));
-      } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
-        switch (Int->getIntrinsicID()) {
-        case Intrinsic::fma:
-        case Intrinsic::sadd_sat:
-        case Intrinsic::uadd_sat:
-          continue;
-        default:
-          break;
-        }
-        for (auto &U : Int->args()) {
-          if (isa<VectorType>(U->getType()))
-            return false;
-        }
+        continue;
+      }
+
+      for (const Use &U : Int->args()) {
+        if (isa<VectorType>(U->getType()))
+          return false;
       }
     }
   }
+
+  if (!ActiveLaneMask) {
+    LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
+    return false;
+  }
   return !MaskedInsts.empty();
 }
 
@@ -451,14 +381,15 @@
 // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
 // 3) The IV must be an induction phi with an increment equal to the
 // vector width.
-bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
-                                          FixedVectorType *VecTy) {
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+                                          Value *TripCount, FixedVectorType *VecTy) {
   // 1) Test whether entry to the loop is protected by a conditional
   // BTC + 1 < 0. In other words, if the scalar trip count overflows,
   // becomes negative, we shouldn't enter the loop and creating
   // tripcount expression BTC + 1 is not safe. So, check that BTC
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
+  int VectorWidth = VecTy->getNumElements();
   auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
   auto *BTC = SE->getSCEV(BackedgeTakenCount);
 
@@ -570,8 +501,8 @@
   if (VectorWidth == StepValue)
     return true;
 
-  LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
-             "vector width : " << VectorWidth << "\n");
+  LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+             "vector width " << VectorWidth << "\n");
 
   return false;
 }
@@ -614,6 +545,7 @@
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
+  unsigned VectorWidth = VecTy->getNumElements();
 
   // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
   // is one less than the trip count. So we need to find or create
@@ -631,10 +563,10 @@
   // represent the effect of tail predication.
   Builder.SetInsertPoint(ActiveLaneMask);
   ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+      ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
 
   Intrinsic::ID VCTPID;
-  switch (VecTy->getNumElements()) {
+  switch (VectorWidth) {
   default:
     llvm_unreachable("unexpected number of lanes");
   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -680,7 +612,7 @@
     if (!Predicate || Predicates.count(Predicate))
       continue;
 
-    ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+    auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
     if (!ActiveLaneMask ||
         ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
       continue;
@@ -689,8 +621,8 @@
     LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask
                       << "\n");
 
-    VecTy = getVectorType(I);
-    if (!IsSafeActiveMask(TripCount, VecTy)) {
+    auto *VecTy = getVectorType(I);
+    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
       LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
       return false;
     }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -49,7 +49,7 @@
   %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
   tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
   %index.next = add i32 %index, 16
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 
@@ -106,7 +106,7 @@
   %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
   tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
   %index.next = add i32 %index, 8
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -160,7 +160,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -221,7 +221,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -277,7 +277,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -336,7 +336,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -344,6 +344,92 @@ ret void } +; TODO: Multiple intrinsics not yet supported. 
+; This is currently rejected, because if the vector body is unrolled, the step +; is not what we expect: +; +; Step value 16 doesn't match vector width 4 +; +; CHECK-LABEL: interleave4 +; CHECK: vector.body: +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) +; +define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %v0 = add i32 %N, 15 + %v1 = lshr i32 %v0, 4 + %v2 = shl nuw i32 %v1, 4 + %v3 = add i32 %v2, -16 + %v4 = lshr i32 %v3, 4 + %v5 = add nuw nsw i32 %v4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + +vector.ph: + %trip.count.minus.1 = add i32 %N, -1 + %scevgep = getelementptr i32, i32* %A, i32 8 + %scevgep30 = getelementptr i32, i32* %C, i32 8 + %scevgep37 = getelementptr i32, i32* %B, i32 8 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] + %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* + %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %v7 = add i32 %index, 4 + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %v8 = add i32 %v7, 4 + %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %v9 = add i32 %v8, 4 + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 + %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1 + %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2 + %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1 + %wide.masked.load22 = call 
<4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1 + %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load + %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18 + %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 + %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 + %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask) + %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16) + %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17) + %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16 + %v14 = add i32 %v9, 4 + %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v16 = icmp ne i32 %v15, 0 + br i1 %v16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -353,7 +439,7 @@ declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) -declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -266,16 +266,9 @@ } ; CHECK-LABEL: @overflow_BTC_plus_1( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> 
undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -316,8 +309,9 @@ } ; CHECK-LABEL: @overflow_in_sub( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -366,8 +360,9 @@ } ; CHECK-LABEL: @overflow_in_rounding_tripcount( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -413,15 +408,9 @@ ; CHECK-LABEL: @IV_not_an_induction( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -462,15 +451,9 @@ } ; CHECK-LABEL: @IV_wrong_step( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -514,15 +497,9 @@ } ; CHECK-LABEL: @IV_step_not_constant( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void 
@IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -563,15 +540,9 @@ } ; CHECK-LABEL: @outerloop_phi( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -143,21 +143,10 @@ ; ; CHECK-LABEL: @reduction_not_guarded ; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; -; CHECK: entry: -; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1 -; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0 -; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer -; -; CHECK: vector.body: -; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) -; CHECK: ret +; CHECK: @llvm.get.active.lane.mask.v8i1.i32 +; CHECK: ret ; define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { entry: @@ -213,20 +202,9 @@ ; ; CHECK-LABEL: @Correlation ; -; CHECK: entry: -; CHECK: for.body.lr.ph: ; preds = %entry -; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph -; CHECK: vector.ph: ; preds = %for.body -; CHECK: %trip.count.minus.1 = add i32 %8, -1 -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7) -; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 -; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc -; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} -; +; CHECK-NOT: @llvm.arm.mve.vctp +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) ; ; FORCE-LABEL: @Correlation ; FORCE: vector.ph: ; preds = %for.body
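
For reference, a minimal sketch (not part of the patch) of the rewrite that
TryConvert/InsertVCTPIntrinsic performs on a 4 x i32 loop once IsSafeActiveMask
holds. The value names %index, %btc and %elts are illustrative: %btc is the
backedge-taken count (trip count - 1), and %elts stands for the element count
set up in the preheader and decremented by the vector width each iteration:

  ; Before: the vectorizer's lane mask, taking the vector IV and the
  ; backedge-taken count:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)

  ; After: the MVE tail-predication intrinsic (4 lanes select
  ; Intrinsic::arm_mve_vctp32), which the backend selects to a VCTP32
  ; instruction:
  %active.lane.mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)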