Index: llvm/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/lib/Target/ARM/MVETailPredication.cpp +++ llvm/lib/Target/ARM/MVETailPredication.cpp @@ -85,8 +85,7 @@ TargetTransformInfo *TTI = nullptr; TargetLibraryInfo *TLI = nullptr; bool ClonedVCTPInExitBlock = false; - IntrinsicInst *ActiveLaneMask = nullptr; - FixedVectorType *VecTy = nullptr; + std::vector<IntrinsicInst *> ActiveLaneMasks; public: static char ID; @@ -118,11 +117,12 @@ /// intrinsic: check if the first is a loop induction variable, and for the /// the second check that no overflow can occur in the expression that use /// this backedge-taken count. - bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy); + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy); /// Insert the intrinsic to represent the effect of tail predication. void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy, + FixedVectorType *vecTy, DenseMap<Instruction *, Instruction *> &NewPredicates); /// Rematerialize the iteration count in exit blocks, which enables @@ -167,11 +167,21 @@ } void MVETailPredication::RevertActiveLaneMask() { - if (!ActiveLaneMask) + if (ActiveLaneMasks.empty()) return; - int VectorWidth = VecTy->getElementCount().Min; - IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI()); + // Perform some sanity checks on the intrinsics: they should at least all have + // the same 2nd argument, i.e. the BTC should be the same. + auto *BTC = ActiveLaneMasks.front()->getOperand(1); + auto *VecTy = dyn_cast<FixedVectorType>(ActiveLaneMasks.front()->getType()); + for (auto *I : ActiveLaneMasks) { + assert(I->getOperand(1) == BTC && + "Same BTC expected for all get.active.lane.mask intrinsics"); + assert(dyn_cast<FixedVectorType>(I->getType()) == VecTy && + "Same types expected"); + } + + unsigned VectorWidth = VecTy->getNumElements(); // 1. Create the vector induction step.
This %induction will be the LHS of // the icmp: @@ -180,19 +190,25 @@ // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0 // %induction = add <4 x i32> %splat, // - Value *Index = ActiveLaneMask->getOperand(0); - Value *SplatIndex = - Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask"); - - SmallVector<Constant *, 8> Indices; - for (int i = 0; i < VectorWidth; ++i) - Indices.push_back(ConstantInt::get(Index->getType(), i)); - - Constant *CV = ConstantVector::get(Indices); - Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction"); - - LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; - dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); + IRBuilder<> Builder(ActiveLaneMasks.front()->getParent()->getFirstNonPHI()); + std::vector<Value *> Inductions; + for (auto *ActiveLaneMask : ActiveLaneMasks) { + Builder.SetInsertPoint(ActiveLaneMask); + Value *Index = ActiveLaneMask->getOperand(0); + Value *SplatIndex = + Builder.CreateVectorSplat(VectorWidth, Index, "index"); + + SmallVector<Constant *, 8> Indices; + for (unsigned i = 0; i < VectorWidth; ++i) + Indices.push_back(ConstantInt::get(Index->getType(), i)); + + Constant *CV = ConstantVector::get(Indices); + Value *Induction = Builder.CreateAdd(SplatIndex, CV, "viv.induction"); + Inductions.push_back(Induction); + + LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; + dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); + } // 2. In the Preheader, first look if the splat BTC already exists.
Find this // %splat, which will be the RHS of the icmp: @@ -202,7 +218,6 @@ // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0 // auto *Preheader = L->getLoopPreheader(); - auto *BTC = ActiveLaneMask->getOperand(1); Value *SplatBTC = nullptr; if (auto *C = dyn_cast<ConstantInt>(BTC)) { @@ -236,11 +251,15 @@ LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n"); } - Builder.SetInsertPoint(ActiveLaneMask); - Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC); - LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n"); - ActiveLaneMask->replaceAllUsesWith(ICmp); - ActiveLaneMask->eraseFromParent(); + int i = 0; + for (auto *ActiveLaneMask : ActiveLaneMasks) { + Builder.SetInsertPoint(ActiveLaneMask); + Value *ICmp = + Builder.CreateICmp(ICmpInst::ICMP_ULE, Inductions[i++], SplatBTC); + LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n"); + ActiveLaneMask->replaceAllUsesWith(ICmp); + ActiveLaneMask->eraseFromParent(); + } } bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { @@ -260,7 +279,7 @@ TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; - ActiveLaneMask = nullptr; + ActiveLaneMasks.clear(); // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -317,15 +336,15 @@ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - if (TryConvert(Setup->getArgOperand(0))) { - if (ClonedVCTPInExitBlock) - RematerializeIterCount(); - return true; - } else + if (!TryConvert(Setup->getArgOperand(0))) { + LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); RevertActiveLaneMask(); + return false; + } - LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); - return false; + if (ClonedVCTPInExitBlock) + RematerializeIterCount(); + return true; } static FixedVectorType *getVectorType(IntrinsicInst *I) { @@ -342,8 +361,19 @@ // load/stores. for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + auto *Int = dyn_cast<IntrinsicInst>(&I); + if (!Int) + continue; + + if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) { + ActiveLaneMasks.push_back(Int); + continue; + } + if (Int->getIntrinsicID() == Intrinsic::fma) + continue; + if (IsMasked(&I)) { - FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + auto *VecTy = getVectorType(Int); unsigned Lanes = VecTy->getNumElements(); unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. @@ -352,17 +382,27 @@ if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); - } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { - if (Int->getIntrinsicID() == Intrinsic::fma) - continue; - for (auto &U : Int->args()) { - if (isa<VectorType>(U->getType())) - return false; - } + continue; + } + + for (const Use &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; } } } + if (ActiveLaneMasks.empty()) { + LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); + return false; + } + // TODO: we only expect/support 1 lane intrinsic, revert if we find more.
+ if (ActiveLaneMasks.size() > 1) { + LLVM_DEBUG(dbgs() + << "ARM TP: Multiple lane intrinsics not yet supported.\n"); + return false; + } + return !MaskedInsts.empty(); } @@ -444,14 +484,15 @@ // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount // 3) The IV must be an induction phi with an increment equal to the // vector width. -bool MVETailPredication::IsSafeActiveMask(Value *TripCount, - FixedVectorType *VecTy) { +bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, + Value *TripCount, FixedVectorType *VecTy) { // 1) Test whether entry to the loop is protected by a conditional // BTC + 1 < 0. In other words, if the scalar trip count overflows, // becomes negative, we shouldn't enter the loop and creating // tripcount expression BTC + 1 is not safe. So, check that BTC // isn't max. This is evaluated in unsigned, because the semantics // of @get.active.lane.mask is a ULE comparison. + int VectorWidth = VecTy->getNumElements(); auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); auto *BTC = SE->getSCEV(BackedgeTakenCount); @@ -607,6 +648,7 @@ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); + unsigned VectorWidth = VecTy->getNumElements(); // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, // is one less than the trip count. So we need to find or create @@ -624,10 +666,10 @@ // represent the effect of tail predication. 
Builder.SetInsertPoint(ActiveLaneMask); ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { + switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; @@ -673,7 +715,7 @@ if (!Predicate || Predicates.count(Predicate)) continue; - ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); + auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); if (!ActiveLaneMask || ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; @@ -682,8 +724,8 @@ LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - VecTy = getVectorType(I); - if (!IsSafeActiveMask(TripCount, VecTy)) { + auto *VecTy = getVectorType(I); + if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -49,7 +49,7 @@ %tmp7 = bitcast i8* %tmp6 to <16 x i8>* tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask) %index.next = add i32 %index, 16 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -106,7 +106,7 @@ %tmp7 = bitcast i16* %tmp6 to <8 x i16>* tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask) %index.next = add i32 %index, 8 - %tmp15 = call i32
@llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -160,7 +160,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -221,7 +221,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -277,7 +277,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -336,7 +336,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -344,6 +344,115 @@ ret void } +; CHECK-LABEL: interleave4 +; +; CHECK: vector.ph: ; preds = 
%entry +; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 +; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer +; +; CHECK: vector.body: ; preds = %vector.body, %vector.ph +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP1:.*]] = icmp ule <4 x i32> %viv.induction, %splat.btc +; CHECK: %v7 = add i32 %index, 4 +; CHECK: %index.splatinsert1 = insertelement <4 x i32> undef, i32 %v7, i32 0 +; CHECK: %index.splat2 = shufflevector <4 x i32> %index.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction3 = add <4 x i32> %index.splat2, +; CHECK: %[[ICMP2:.*]] = icmp ule <4 x i32> %viv.induction3, %splat.btc +; CHECK: %v8 = add i32 %v7, 4 +; CHECK: %index.splatinsert4 = insertelement <4 x i32> undef, i32 %v8, i32 0 +; CHECK: %index.splat5 = shufflevector <4 x i32> %index.splatinsert4, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction6 = add <4 x i32> %index.splat5, +; CHECK: %[[ICMP3:.*]] = icmp ule <4 x i32> %viv.induction6, %splat.btc +; CHECK: %v9 = add i32 %v8, 4 +; CHECK: %index.splatinsert7 = insertelement <4 x i32> undef, i32 %v9, i32 0 +; CHECK: %index.splat8 = shufflevector <4 x i32> %index.splatinsert7, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction9 = add <4 x i32> %index.splat8, +; CHECK: %[[ICMP4:.*]] = icmp ule <4 x i32> %viv.induction9, %splat.btc +; +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP1]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP2]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP3]], <4 x i32> undef) +; CHECK: call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP4]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP1]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP2]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP3]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP4]], <4 x i32> undef) +; +define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %v0 = add i32 %N, 15 + %v1 = lshr i32 %v0, 4 + %v2 = shl nuw i32 %v1, 4 + %v3 = add i32 %v2, -16 + %v4 = lshr i32 %v3, 4 + %v5 = add nuw nsw i32 %v4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + +vector.ph: + %trip.count.minus.1 = add i32 %N, -1 + %scevgep = getelementptr i32, i32* %A, i32 8 + %scevgep30 = getelementptr i32, i32* %C, i32 8 + %scevgep37 = getelementptr i32, i32* %B, i32 8 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] + %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* + %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %v7 = add i32 %index, 4 + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %v8 = add i32 %v7, 4 + %active.lane.mask16 
= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %v9 = add i32 %v8, 4 + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 + %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1 + %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2 + %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1 + %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1 + %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load + %v11 = add nsw <4 x i32> %wide.masked.load22, 
%wide.masked.load18 + %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 + %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 + %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask) + %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16) + %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17) + %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16 + %v14 = add i32 %v9, 4 + %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v16 = icmp ne i32 %v15, 0 + br i1 %v16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -353,7 +462,7 @@ declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) -declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) 
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -270,10 +270,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; ; CHECK: ret void @@ -417,10 +417,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> 
%[[ICMP]], <4 x i32> undef) ; CHECK: ret void ; @@ -466,10 +466,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; CHECK: ret void ; @@ -518,10 +518,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; CHECK: ret void ; @@ -566,10 +566,10 @@ ; ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x 
i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; ; CHECK: ret void Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -152,10 +152,10 @@ ; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer ; ; CHECK: vector.body: -; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 +; CHECK: %index.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <8 x i32> %index.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK: %viv.induction = add <8 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %viv.induction, %broadcast.splat2 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) ; CHECK: ret ; @@ -224,7 +224,7 @@ ; CHECK: br label %vector.body ; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> 
%lane.mask.induction, %splat.btc +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, %splat.btc ; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} ; ;