Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -124,16 +124,21 @@
   /// the second check that no overflow can occur in the expression that use
   /// this backedge-taken count.
   bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                        FixedVectorType *VecTy);
+                        FixedVectorType *VecTy, Instruction *DefBTC);
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                           FixedVectorType *VecTy);
+                           FixedVectorType *VecTy, Instruction *DefBTC);
 
   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
+
+  /// Given the backedge taken count (BTC) use from get.active.lane.mask,
+  /// find its definition from which we can extract the number of elements
+  /// processed by the loop.
+  Instruction *MatchDefBTC(Value *BTC);
 };
 
 } // end namespace
@@ -347,6 +352,7 @@
 // Here we perform checks that this intrinsic behaves as expected,
 // which means:
 //
+// 0) Check that BTC is indeed the loop's BTC.
 // 1) The element count, which is calculated with BTC + 1, cannot overflow.
 // 2) The element count needs to be sufficiently large that the decrement of
 //    element counter doesn't overflow, which means that we need to prove:
@@ -358,19 +364,58 @@
 // 3) The IV must be an induction phi with an increment equal to the
 //    vector width.
 bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy) {
+    Value *TripCount, FixedVectorType *VecTy, Instruction *DefBTC) {
   bool ForceTailPredication =
     EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
     EnableTailPredication == TailPredication::ForceEnabled;
+  auto *IntrinsicBTC = ActiveLaneMask->getOperand(1);
+  int VectorWidth = VecTy->getNumElements();
+  ConstantInt *BTCValue = nullptr;
+
+  // 0) Check that this BTC is indeed the loop's BTC.
+  if ((BTCValue = dyn_cast<ConstantInt>(IntrinsicBTC))) {
+    ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);
+    if (!TC) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "
+                           "set.loop.iterations\n");
+      return false;
+    }
+
+    // Calculate 2 tripcount values and check that they are consistent with
+    // each other:
+    // - The number of loop iterations extracted from the set.loop.iterations
+    //   intrinsic, multiplied by the vector width:
+    uint64_t TC1 = TC->getZExtValue() * VectorWidth;
+
+    // - TC1 has to be equal to BTC + 1 + 1, where BTC + 1 is the loop
+    //   tripcount and the extra + 1 compensates for starting the count at 0.
+    uint64_t TC2 = BTCValue->getZExtValue() + 1 + 1;
+
+    if (TC1 != TC2) {
+      LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
+                        << TC1 << " from set.loop.iterations, and "
+                        << TC2 << " from get.active.lane.mask\n");
+      return false;
+    }
+  } else if (DefBTC) {
+    // Because we look for the definition of BTC in a preheader, we know it is
+    // loop invariant and don't need to check that again here.
+    LLVM_DEBUG(dbgs() << "ARM TP: BTC found in the preheader block: "
+                      << *DefBTC << "\n");
+  } else {
+    LLVM_DEBUG(dbgs() << "ARM TP: Couldn't verify that get.active.lane.mask "
+                         "second argument is the backedge taken count.\n");
+    return false;
+  }
+
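To illustrate the new step 0 on a hypothetical loop over N = 7999 i32 elements with a vector width of 4 (these numbers are made up for this sketch and do not come from the tests below), both intrinsics carry a constant and the two derived trip counts must agree:

  ; ceil(7999 / 4) = 2000 hardware-loop iterations:
  call void @llvm.set.loop.iterations.i32(i32 2000)
  ...
  ; the backedge-taken count is 7999 - 1 = 7998:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 7998)

  ; TC1 = 2000 * 4     = 8000
  ; TC2 = 7998 + 1 + 1 = 8000   ; consistent, so the check passes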
   // 1) Test whether entry to the loop is protected by a conditional
   // BTC + 1 < 0. In other words, if the scalar trip count overflows,
   // becomes negative, we shouldn't enter the loop and creating
   // tripcount expression BTC + 1 is not safe. So, check that BTC
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
-  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
-  auto *BTC = SE->getSCEV(BackedgeTakenCount);
+  auto *BTC = SE->getSCEV(IntrinsicBTC);
   auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
 
   if (isa<SCEVCouldNotCompute>(MaxBTC)) {
@@ -404,7 +449,6 @@
   //
   auto *TC = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
-  int VectorWidth = VecTy->getNumElements();
   auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
   uint64_t MaxMinusVW = Diff.getZExtValue();
   uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
@@ -494,40 +538,23 @@
   return false;
 }
 
-// Materialize NumElements in the preheader block.
-static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
-  // First, check the preheader if it not already exist:
-  //
-  // preheader:
-  //    %BTC = add i32 %N, -1
-  //    ..
-  // vector.body:
-  //
-  // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1,
-  // but instead can just return %N.
-  for (auto &I : *Preheader) {
-    if (I.getOpcode() != Instruction::Add || &I != BTC)
-      continue;
-    ConstantInt *MinusOne = nullptr;
-    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
-      continue;
-    if (MinusOne->getSExtValue() == -1) {
-      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
-      return I.getOperand(0);
-    }
-  }
+// Materialize NumElements in the preheader block if necessary.
+static Value *getNumElements(BasicBlock *Preheader, Instruction *DefBTC,
+                             Value *UseBTC) {
+  if (DefBTC)
+    return DefBTC->getOperand(0);
 
-  // But we do need to materialise BTC if it is not already there,
-  // e.g. if it is a constant.
+  // But we do need to materialise it if the definition of BTC is not already
+  // present, e.g. if it is a constant.
   IRBuilder<> Builder(Preheader->getTerminator());
-  Value *NumElements = Builder.CreateAdd(BTC,
-        ConstantInt::get(BTC->getType(), 1), "num.elements");
+  Value *NumElements = Builder.CreateAdd(UseBTC,
+        ConstantInt::get(UseBTC->getType(), 1), "num.elements");
   LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
   return NumElements;
 }
 
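A sketch of the two paths through the rewritten getNumElements, reusing the hypothetical %N and 7998 from the sketch above: when MatchDefBTC found a definition, the element count is simply that instruction's first operand, otherwise an add is materialised:

  ; DefBTC found in a preheader:
  ;   %trip.count.minus.1 = add i32 %N, -1
  ; => NumElements = DefBTC->getOperand(0) = %N, no new instruction needed.
  ;
  ; No DefBTC, e.g. a constant BTC of 7998:
  ;   %num.elements = add i32 7998, 1   ; created before the preheader terminator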
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy) {
+    Value *TripCount, FixedVectorType *VecTy, Instruction *DefBTC) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
@@ -536,9 +563,9 @@
   // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
   // is one less than the trip count. So we need to find or create
   // %num.elements = %BTC + 1 in the preheader.
-  Value *BTC = ActiveLaneMask->getOperand(1);
+  Value *UseBTC = ActiveLaneMask->getOperand(1);
   Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
-  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
+  Value *NumElements = getNumElements(L->getLoopPreheader(), DefBTC, UseBTC);
 
   // Insert a phi to count the number of elements processed by the loop.
   Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
@@ -578,6 +605,55 @@
                     << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
 }
 
+
+// If it is not a constant value, we assume that the backedge taken count
+// expression exists in this form in the loop-nest hierarchy:
+//
+// preheader:
+//    %BTC = add i32 %N, -1
+//    ..
+// vector.body:
+//
+// We use this to materialise the value that represents the "number of elements
+// processed", and also in the sanity checks for get.active.lane.mask to check
+// that the BTC is in fact the loop's BTC.
+//
+Instruction *MVETailPredication::MatchDefBTC(Value *BTC) {
+  auto getBTC = [&] (BasicBlock *BB) -> Instruction * {
+    for (auto &I : *BB) {
+      if (I.getOpcode() != Instruction::Add || &I != BTC)
+        continue;
+      ConstantInt *MinusOne = nullptr;
+      if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+        continue;
+      if (MinusOne->getSExtValue() == -1) {
+        LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+        return &I;
+      }
+    }
+    return nullptr;
+  };
+
+  Loop *L = this->L;
+
+  do {
+    if (!L->getLoopPreheader())
+      return nullptr;
+
+    Instruction *I = nullptr;
+    if ((I = getBTC(L->getLoopPreheader())))
+      return I;
+
+    BasicBlock *Pred = nullptr;
+    if ((Pred = L->getLoopPreheader()->getSinglePredecessor()))
+      if ((I = getBTC(Pred)))
+        return I;
+
+  } while ((L = L->getParentLoop()));
+
+  return nullptr;
+}
+
 bool MVETailPredication::TryConvert(Value *TripCount) {
   if (!IsPredicatedVectorLoop()) {
     LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
@@ -605,12 +681,14 @@
                       << *ActiveLaneMask << "\n");
 
     auto *VecTy = getVectorType(I);
-    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+    auto *DefBTC = MatchDefBTC(ActiveLaneMask->getOperand(1));
+
+    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy, DefBTC)) {
       LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
       return false;
     }
    LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
-    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, DefBTC);
   }
 
   Cleanup(Predicates, L);
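MatchDefBTC walks up the loop nest: it checks the preheader of each loop and, failing that, the preheader's single predecessor. A minimal sketch of the nested case it is meant to handle (the block names are illustrative, loosely modelled on the nested.ll changes further down):

  ; for.cond1.preheader.us:                 ; preheader of the inner vector loop
  ;   %trip.count.minus.1 = add i32 %N, -1  ; DefBTC found here, or in a parent loop
  ;   ...
  ; vector.body:
  ;   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)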
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -430,6 +430,211 @@
   ret void
 }
 
+; CHECK-LABEL: BTC_not_N_minus_1
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @BTC_not_N_minus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+
+; BTC is not of the form BTC = N - 1 here:
+
+  %trip.count.minus.1 = add i32 %N, -2
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: BTC_not_N_minus_1_v2
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @BTC_not_N_minus_1_v2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+
+; BTC is not of the form BTC = N - 1 here:
+
+  %trip.count.minus.1 = sub i32 %N, -1
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
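In @BTC_not_N_minus_1 above, the matcher correctly rejects %trip.count.minus.1 = add i32 %N, -2. The v2 variant above probes the other half of the pattern: the value is numerically close, but the instruction shape is wrong, so it must not match either:

  ; %btc = add i32 %N, -1   ; matches: an add with a constant -1 operand
  ; %btc = sub i32 %N, -1   ; no match: a sub, and it computes %N + 1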
+; CHECK-LABEL: BTC_not_N_minus_1_v3
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @BTC_not_N_minus_1_v3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+
+; We don't have a BTC = N - 1 instruction here (or anywhere else).
+
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
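@BTC_not_N_minus_1_v3 above passes %N itself as the mask's second operand, so there is no defining add to find at all. The next test, @const_expected_in_set_loop, exercises the constant path of step 0: the mask's BTC is the constant 42, but set.loop.iterations receives the non-constant %5, so IsSafeActiveMask bails out with "Constant tripcount expected in set.loop.iterations":

  call void @llvm.set.loop.iterations.i32(i32 %5)   ; not a ConstantInt
  ...
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)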
+; CHECK-LABEL: const_expected_in_set_loop
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
@@ -23,13 +23,12 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP3]])
-; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ %n, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>*
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
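The CHECK updates above, and in nested.ll below, show the payoff of MatchDefBTC. A condensed before/after sketch of what the updated checks express: since the preheader already defines the backedge-taken count, the pass no longer materialises a separate element count, and the counting phi starts at the trip count directly:

  ; before: %num.elements = add i32 %trip.count.minus.1, 1
  ;         %phi = phi i32 [ %num.elements, %vector.ph ], ...
  ; after:  %phi = phi i32 [ %n, %vector.ph ], ...   ; %n taken from the add's operand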
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -24,13 +24,12 @@
 ; CHECK-NEXT:    [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
 ; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ %N, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
@@ -146,13 +145,12 @@
 ; CHECK-NEXT:    [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
 ; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ %N, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -265,13 +265,13 @@
   ret void
 }
 
-; CHECK-LABEL: @overflow_BTC_plus_1(
+; CHECK-LABEL: @inconsistent_tripcounts(
 ; CHECK: vector.body:
 ; CHECK-NOT: @llvm.arm.mve.vctp32
 ; CHECK: @llvm.get.active.lane.mask
 ; CHECK: ret void
 ;
-define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
   call void @llvm.set.loop.iterations.i32(i32 8001)
   br label %vector.body
@@ -316,63 +316,7 @@
 ;
 define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
-  br label %vector.body
-
-vector.body:
-  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
-  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
-  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
-  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
-  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
-  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
-  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-; Overflow in the substraction. This should hold:
-;
-;   ceil(ElementCount / VectorWidth) >= TripCount
-;
-; But we have:
-;
-;   ceil(3200 / 4) >= 8001
-;   8000 >= 8001
-;
-  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 31999)
-
-  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
-  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
-  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
-  %index.next = add i32 %index, 4
-  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
-  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
-  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
-  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
-  %4 = icmp ne i32 %3, 0
-  br i1 %4, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-; CHECK-LABEL: @overflow_in_rounding_tripcount(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK: @llvm.get.active.lane.mask
-; CHECK: ret void
-;
-define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
-entry:
-
-; TC = 4294967292
-; 4294967292 <= 4294967291 (MAX - vectorwidth)
-; False
-;
-  call void @llvm.set.loop.iterations.i32(i32 4294967291)
+  call void @llvm.set.loop.iterations.i32(i32 1073741824)
   br label %vector.body
 
 vector.body:
@@ -388,7 +332,7 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 
-  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967294)
 
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)