Diff 294621

llvm/lib/Target/ARM/MVETailPredication.cpp

	Show First 20 Lines • Show All 367 Lines • ▼ Show 20 Lines
	// 3) The IV must be an induction phi with an increment equal to the			// 3) The IV must be an induction phi with an increment equal to the
	// vector width.			// vector width.
	bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,			bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
	Value TripCount, FixedVectorType VecTy) {			Value TripCount, FixedVectorType VecTy) {
	bool ForceTailPredication =			bool ForceTailPredication =
	EnableTailPredication == TailPredication::ForceEnabledNoReductions \|\|			EnableTailPredication == TailPredication::ForceEnabledNoReductions \|\|
	EnableTailPredication == TailPredication::ForceEnabled;			EnableTailPredication == TailPredication::ForceEnabled;

	// 1) Check that the original scalar loop TripCount (TC) belongs to this loop.
	// The scalar tripcount corresponds to the number of elements processed by
	// the loop, so we will refer to that from this point on.
	Value *ElemCount = ActiveLaneMask->getOperand(1);			Value *ElemCount = ActiveLaneMask->getOperand(1);
	auto *EC= SE->getSCEV(ElemCount);			auto *EC= SE->getSCEV(ElemCount);
	auto *TC = SE->getSCEV(TripCount);			auto *TC = SE->getSCEV(TripCount);
	int VectorWidth = VecTy->getNumElements();			int VectorWidth = VecTy->getNumElements();
	ConstantInt *ConstElemCount = nullptr;			ConstantInt *ConstElemCount = nullptr;

				// 1) Smoke tests that the original scalar loop TripCount (TC) belongs to
				// this loop. The scalar tripcount corresponds to the number of elements
				// processed by the loop, so we will refer to that from this point on.
	if (!SE->isLoopInvariant(EC, L)) {			if (!SE->isLoopInvariant(EC, L)) {
	LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");			LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");
	return false;			return false;
	}			}

	if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) {			if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) {
	ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);			ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);
	if (!TC) {			if (!TC) {
	LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "			LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "
	"set.loop.iterations\n");			"set.loop.iterations\n");
	return false;			return false;
	}			}

	// Calculate 2 tripcount values and check that they are consistent with			// Calculate 2 tripcount values and check that they are consistent with
	// each other:			// each other:
	// i) The number of loop iterations extracted from the set.loop.iterations			// i) The number of loop iterations extracted from the set.loop.iterations
	// intrinsic, multiplied by the vector width:			// intrinsic, multiplied by the vector width:
	uint64_t TC1 = TC->getZExtValue() * VectorWidth;			uint64_t TC1 = TC->getZExtValue() * VectorWidth;

	// ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start			// ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start
	// counting from 0.			// counting from 0.
				[Inline review comment — efriedma, Not Done] Maybe add a comment clarifying why we want to bail out here?
	uint64_t TC2 = ConstElemCount->getZExtValue() + 1;			uint64_t TC2 = ConstElemCount->getZExtValue() + 1;

				// If the tripcount values are inconsistent, we don't want to insert the
				// VCTP and trigger tail-predication; it's better to keep intrinsic
				// get.active.lane.mask and legalize this.
	if (TC1 != TC2) {			if (TC1 != TC2) {
	LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "			LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
	<< TC1 << " from set.loop.iterations, and "			<< TC1 << " from set.loop.iterations, and "
	<< TC2 << " from get.active.lane.mask\n");			<< TC2 << " from get.active.lane.mask\n");
	return false;			return false;
	}			}
	} else if (!ForceTailPredication) {			} else if (!ForceTailPredication) {
	// Smoke tests if the element count is a runtime value. I.e., this isn't			// 2) We need to prove that the sub expression that we create in the
	// fully generic because that would require a full SCEV visitor here. It			// tail-predicated loop body, which calculates the remaining elements to be
	// would require extracting the variable from the elementcount SCEV			// processed, is non-negative, i.e. it doesn't overflow:
	// expression, and match this up with the tripcount SCEV expression. If
	// this matches up, we know both expressions are bound by the same
	// variable, and thus we know this tripcount belongs to this loop. The
	// checks below will catch most cases though.
	if (isa<SCEVAddExpr>(EC) \|\| isa<SCEVUnknown>(EC)) {
	// If the element count is a simple AddExpr or SCEVUnknown, which is e.g.
	// the case when the element count is just a variable %N, we can just see
	// if it is an operand in the tripcount scev expression.
	if (isa<SCEVAddExpr>(TC) && !SE->hasOperand(TC, EC)) {
	LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
	return false;
	}
	} else if (const SCEVAddRecExpr *AddRecExpr = dyn_cast<SCEVAddRecExpr>(EC)) {
	// For more complicated AddRecExpr, check that the corresponding loop and
	// its loop hierarchy contains the trip count loop.
	if (!AddRecExpr->getLoop()->contains(L)) {
	LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
	return false;
	}
	} else {
	LLVM_DEBUG(dbgs() << "ARM TP: Unsupported SCEV type, can't verify the "
	"element counter\n");
	return false;
	}
	}

	// 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
	//			//
	// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount			// ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0
	//
	// 2.1) First prove overflow can't happen in:
	//			//
	// ElementCount + (VectorWidth - 1)			// This is true if:
	//			//
	// Because of a lack of context, it is difficult to get a useful bounds on			// TripCount == (ElementCount + VectorWidth - 1) / VectorWidth
	// this expression. But since ElementCount uses the same variables as the
	// TripCount (TC), for which we can find meaningful value ranges, we use that
	// instead and assert that:
	//			//
	// upperbound(TC) <= UINT_MAX - VectorWidth			// which is what we will be using here.
	//			//
	unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();			auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth));
	auto MaxMinusVW = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);			// ElementCount + (VW-1):
	APInt UpperboundTC = SE->getUnsignedRangeMax(TC);			auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
				SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));

	if (UpperboundTC.ugt(MaxMinusVW) && !ForceTailPredication) {			// Ceil = ElementCount + (VW-1) / VW
	LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";			auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW);
	dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
	dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
	return false;
	}

	// 2.2) Make sure overflow doesn't happen in final expression:			LLVM_DEBUG(
	// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,			dbgs() << "ARM TP: Analysing overflow behaviour for:\n";
	// To do this, compare the full ranges of these subexpressions:			dbgs() << "ARM TP: - TripCount = "; TC->dump();
	//			dbgs() << "ARM TP: - ElemCount = "; EC->dump();
	// Range(Ceil) <= Range(TC)			dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n";
	//			dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump();
	// where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime			);
	// values (and not constants), we have to compensate for the lowerbound value
	// range to be off by 1. The reason is that the TC lives in the preheader in			// As an example, almost all the tripcount expressions (produced by the
	// this form:			// vectoriser) look like this:
	//			//
	// %trip.count.minus = add nsw nuw i32 %N, -1			// TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4)
	//			//
	// For the loop to be executed, %N has to be >= 1 and as a result the value			// and "ElementCount + (VW-1) / VW":
	// range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
	//			//
	// %5 = add nuw nsw i32 %4, 1			// Ceil = ((3 + %N) /u 4)
	// call void @llvm.set.loop.iterations.i32(i32 %5)
	//			//
	// where %5 is some expression using %N, which needs to have a lower bound of			// Check for equality of TC and Ceil by calculating SCEV expression
	// 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,			// TC - Ceil and test it for zero.
	// we first add 0 to TC such that we can do the <= comparison on both sets.
	//			//
				bool Zero = SE->getMinusSCEV(
				SE->getBackedgeTakenCount(L),
				[Inline review comment — efriedma, Not Done] I'm not really comfortable using hasOperand here. Can we actually do something like `SE->getBackedgeTakenCount(L) == SE->getUDiv(SE->getAdd(SE->getMul(Ceil, VectorWidth), SE->getNeg(VectorWidth)), VectorWidth)`? Or is that not reliable enough?
				[Inline reply — SjoerdMeijer (author), Done] Yeah, I was trying to avoid that bit of pattern matching (and hasOperand was so convenient), but will give that a try tomorrow.
				SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
				SE->getNegativeSCEV(VW)),
				VW))
				->isZero();

	// Tmp = ElementCount + (VW-1)			if (!Zero) {
	auto *ECPlusVWMinus1 = SE->getAddExpr(EC,			LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n");
	SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
	// Ceil = (ElementCount + (VW-1)) / VW
	auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
	SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));

	ConstantRange RangeCeil = SE->getUnsignedRange(Ceil) ;
	ConstantRange RangeTC = SE->getUnsignedRange(TC) ;
	if (!RangeTC.isSingleElement()) {
	auto ZeroRange =
	ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
	RangeTC = RangeTC.unionWith(ZeroRange);
	}
	if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
	LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
	return false;			return false;
	}			}
				}

	// 3) Find out if IV is an induction phi. Note that we can't use Loop			// 3) Find out if IV is an induction phi. Note that we can't use Loop
				[Inline review comment — efriedma, Not Done] Should this be using EC, not TC?
				[Inline review comment — efriedma, Not Done] I'm not confident this is actually proving what you want to prove? If x is in the range [0, 10), and y is in the range [0, 12), that doesn't prove x<y.
	// helpers here to get the induction variable, because the hardware loop is			// helpers here to get the induction variable, because the hardware loop is
	// no longer in loopsimplify form, and also the hwloop intrinsic uses a			// no longer in loopsimplify form, and also the hwloop intrinsic uses a
	// different counter. Using SCEV, we check that the induction is of the			// different counter. Using SCEV, we check that the induction is of the
	// form i = i + 4, where the increment must be equal to the VectorWidth.			// form i = i + 4, where the increment must be equal to the VectorWidth.
	auto *IV = ActiveLaneMask->getOperand(0);			auto *IV = ActiveLaneMask->getOperand(0);
	auto *IVExpr = SE->getSCEV(IV);			auto *IVExpr = SE->getSCEV(IV);
	auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);			auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);

	if (!AddExpr) {			if (!AddExpr) {
	LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());			LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
	return false;			return false;
	}			}
	// Check that this AddRec is associated with this loop.			// Check that this AddRec is associated with this loop.
	if (AddExpr->getLoop() != L) {			if (AddExpr->getLoop() != L) {
	LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");			LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
	return false;			return false;
	}			}
				auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0));
				if (!Base \|\| !Base->isZero()) {
				LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n");
				return false;
				}
	auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));			auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
	if (!Step) {			if (!Step) {
	LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";			LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
	AddExpr->getOperand(1)->dump());			AddExpr->getOperand(1)->dump());
	return false;			return false;
	}			}
				[Inline review comment — efriedma, Not Done] Do we need to check the first operand of the AddRec is zero?
				[Inline review comment — efriedma, Not Done] Any further thought on this?
				[Inline reply — SjoerdMeijer (author), Done] Sorry, forgot about this one, will also address this.
	auto StepValue = Step->getValue()->getSExtValue();			auto StepValue = Step->getValue()->getSExtValue();
	if (VectorWidth == StepValue)			if (VectorWidth == StepValue)
	return true;			return true;

	LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "			LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
	"vector width " << VectorWidth << "\n");			"vector width " << VectorWidth << "\n");

	return false;			return false;
	▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll

Show First 20 Lines • Show All 472 Lines • ▼ Show 20 Lines	vector.body: ; preds = %vector.body, %vector.ph
%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)		%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
%9 = icmp ne i32 %8, 0		%9 = icmp ne i32 %8, 0
br i1 %9, label %vector.body, label %for.cond.cleanup		br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry		for.cond.cleanup: ; preds = %vector.body, %entry
ret void		ret void
}		}

; CHECK-LABEL: wrong_tripcount_arg		; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK: vector.body:		; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK: call <4 x i1> @llvm.arm.mve.vctp32		; CHECK-NOT: vctp
; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32
; CHECK: vector.body35:
; CHECK: call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32
; CHECK-NOT: call <4 x i1> @llvm.arm.mve.vctp32
; CHECK: ret void		; CHECK: ret void
;		;
define dso_local void @wrong_tripcount_arg(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture %D, i32 %N1, i32 %N2) local_unnamed_addr #0 {		define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:		entry:
%cmp29 = icmp sgt i32 %N1, 0		%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N1, 3		%0 = add i32 %N, 3
%1 = lshr i32 %0, 2		%1 = lshr i32 %0, 2
%2 = shl nuw i32 %1, 2		%2 = shl nuw i32 %1, 2
%3 = add i32 %2, -4		%3 = add i32 %2, -4
%4 = lshr i32 %3, 2		%4 = lshr i32 %3, 2
%5 = add nuw nsw i32 %4, 1		%5 = add nuw nsw i32 %4, 1
br i1 %cmp29, label %vector.ph, label %for.cond4.preheader		br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry		vector.ph: ; preds = %entry
		%trip.count.minus.1 = add i32 %N, -1
call void @llvm.set.loop.iterations.i32(i32 %5)		call void @llvm.set.loop.iterations.i32(i32 %5)
br label %vector.body		br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph		vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %D, %vector.ph ]		%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
%lsr.iv59 = phi i32* [ %scevgep60, %vector.body ], [ %C, %vector.ph ]		%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
%lsr.iv56 = phi i32* [ %scevgep57, %vector.body ], [ %B, %vector.ph ]		%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]		%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]		%6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
%lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
%lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*		%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv6264 = bitcast i32* %lsr.iv62 to <4 x i32>*		%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N1)		%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5658, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5961, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)		%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
%7 = add nsw <4 x i32> %wide.masked.load32, %wide.masked.load
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv6264, i32 4, <4 x i1> %active.lane.mask)		%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
		%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
		%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
		call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
%index.next = add i32 %index, 4		%index.next = add i32 %index, 4
%scevgep57 = getelementptr i32, i32* %lsr.iv56, i32 4		%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep60 = getelementptr i32, i32* %lsr.iv59, i32 4		%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
%scevgep63 = getelementptr i32, i32* %lsr.iv62, i32 4		%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)		%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
%9 = icmp ne i32 %8, 0		%9 = icmp ne i32 %8, 0
br i1 %9, label %vector.body, label %for.cond4.preheader		;br i1 %9, label %vector.body, label %for.cond.cleanup
		br i1 %9, label %vector.body, label %vector.ph
for.cond4.preheader: ; preds = %vector.body, %entry
%cmp527 = icmp sgt i32 %N2, 0
%10 = add i32 %N2, 3
%11 = lshr i32 %10, 2
%12 = shl nuw i32 %11, 2
%13 = add i32 %12, -4
%14 = lshr i32 %13, 2
%15 = add nuw nsw i32 %14, 1
br i1 %cmp527, label %vector.ph36, label %for.cond.cleanup6

vector.ph36: ; preds = %for.cond4.preheader
call void @llvm.set.loop.iterations.i32(i32 %15)
br label %vector.body35

vector.body35: ; preds = %vector.body35, %vector.ph36
%lsr.iv53 = phi i32* [ %scevgep54, %vector.body35 ], [ %A, %vector.ph36 ]
%lsr.iv50 = phi i32* [ %scevgep51, %vector.body35 ], [ %C, %vector.ph36 ]
%lsr.iv = phi i32* [ %scevgep, %vector.body35 ], [ %B, %vector.ph36 ]
%index40 = phi i32 [ 0, %vector.ph36 ], [ %index.next41, %vector.body35 ]
%16 = phi i32 [ %15, %vector.ph36 ], [ %18, %vector.body35 ]
%lsr.iv49 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv5052 = bitcast i32* %lsr.iv50 to <4 x i32>*
%lsr.iv5355 = bitcast i32* %lsr.iv53 to <4 x i32>*

; This has N1 as the tripcount / element count, which is the tripcount of the
; first loop and not this one:
%active.lane.mask46 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index40, i32 %N1)

%wide.masked.load47 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv49, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef)
%wide.masked.load48 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5052, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef)
%17 = add nsw <4 x i32> %wide.masked.load48, %wide.masked.load47
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %17, <4 x i32>* %lsr.iv5355, i32 4, <4 x i1> %active.lane.mask46)
%index.next41 = add i32 %index40, 4
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep51 = getelementptr i32, i32* %lsr.iv50, i32 4
%scevgep54 = getelementptr i32, i32* %lsr.iv53, i32 4
%18 = call i32 @llvm.loop.decrement.reg.i32(i32 %16, i32 1)
%19 = icmp ne i32 %18, 0
br i1 %19, label %vector.body35, label %for.cond.cleanup6

for.cond.cleanup6: ; preds = %vector.body35, %for.cond4.preheader		for.cond.cleanup: ; preds = %vector.body, %entry
ret void		ret void
}		}

; CHECK-LABEL: tripcount_arg_not_invariant		; CHECK-LABEL: addrec_base_not_zero
; CHECK: call <4 x i1> @llvm.get.active.lane.mask		; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp		; CHECK-NOT: vctp
; CHECK: ret void		; CHECK: ret void
;		;
define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {		define dso_local void @addrec_base_not_zero(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:		entry:
%cmp8 = icmp sgt i32 %N, 0		%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3		%0 = add i32 %N, 3
%1 = lshr i32 %0, 2		%1 = lshr i32 %0, 2
%2 = shl nuw i32 %1, 2		%2 = shl nuw i32 %1, 2
%3 = add i32 %2, -4		%3 = add i32 %2, -4
%4 = lshr i32 %3, 2		%4 = lshr i32 %3, 2
%5 = add nuw nsw i32 %4, 1		%5 = add nuw nsw i32 %4, 1
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup		br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry		vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1		%trip.count.minus.1 = add i32 %N, -1
call void @llvm.set.loop.iterations.i32(i32 %5)		call void @llvm.set.loop.iterations.i32(i32 %5)
br label %vector.body		br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph		vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]		%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]		%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]		%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]

		; AddRec base is not 0:
		%index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

		%6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*		%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*		%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*		%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
		%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)		%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)		%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load		%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)		call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
%index.next = add i32 %index, 4		%index.next = add i32 %index, 4
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4		%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4		%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4		%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)		%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
%9 = icmp ne i32 %8, 0		%9 = icmp ne i32 %8, 0
;br i1 %9, label %vector.body, label %for.cond.cleanup		;br i1 %9, label %vector.body, label %for.cond.cleanup
br i1 %9, label %vector.body, label %vector.ph		br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup: ; preds = %vector.body, %entry		for.cond.cleanup: ; preds = %vector.body, %entry
ret void		ret void
}		}


declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)		declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)		declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)		declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)		declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)		declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)		declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)		declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)		declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.set.loop.iterations.i32(i32)		declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)		declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)		declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)		declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)		declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll

Show First 20 Lines • Show All 369 Lines • ▼ Show 20 Lines	vector.body:
%0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]		%0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*		%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*		%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*		%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0		%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer		%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>		%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; The induction variable %D is not an IV:		; The induction variable %N is not an IV:
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)		%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)		%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)		%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load		%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)		call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4		%index.next = add i32 %index, 4
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4		%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
▲ Show 20 Lines • Show All 164 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll

This file was deleted.

	; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - \| FileCheck %s --check-prefixes=CHECK,ENABLED
	; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled -mattr=+mve,+lob %s -S -o - \| FileCheck %s --check-prefixes=CHECK,FORCED

	; CHECK-LABEL: set_iterations_not_rounded_up
	;
	; ENABLED: call <4 x i1> @llvm.get.active.lane.mask
	; ENABLED-NOT: vctp
	;
	; FORCED-NOT: call <4 x i1> @llvm.get.active.lane.mask
	; FORCED: vctp
	;
	; CHECK: ret void
	;
	define dso_local void @set_iterations_not_rounded_up(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
	entry:
	; The vector loop is only entered when %N is greater than 0 (signed compare).
	%cmp8 = icmp sgt i32 %N, 0

	; %v5 is the value passed to set.loop.iterations below. When the vectoriser
	; emits this value, it is usually derived from the element count rounded up
	; to the next multiple of the VF, which means a bound can be put on the
	; expression. Here it is simply %N + 1, so no such bound can be established
	; and this should be flagged as potential overflow behaviour.

	%v5 = add nuw nsw i32 %N, 1
	br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

	vector.ph: ; preds = %entry
	; NOTE(review): %trip.count.minus.1 has no uses inside this function.
	%trip.count.minus.1 = add i32 %N, -1
	; The hardware-loop iteration count is the non-rounded %v5.
	call void @llvm.set.loop.iterations.i32(i32 %v5)
	br label %vector.body

	vector.body: ; preds = %vector.body, %vector.ph
	; Pointer inductions over %A, %C and %B; each advances by 4 i32 elements per
	; iteration (see the getelementptr instructions at the end of the block).
	%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
	%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
	%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
	; Element index: starts at 0 and is incremented by 4 each iteration.
	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
	; Remaining-iterations counter: starts at %v5 and is decremented by 1.
	%v6 = phi i32 [ %v5, %vector.ph ], [ %v8, %vector.body ]
	%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
	%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
	%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
	; The lane mask takes the induction variable %index and the element count %N;
	; it predicates both masked loads and the masked store below.
	%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
	%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
	%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
	%v7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
	call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
	%index.next = add i32 %index, 4
	%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
	%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
	%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
	; Decrement the loop counter and branch back while it is non-zero.
	%v8 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
	%v9 = icmp ne i32 %v8, 0
	br i1 %v9, label %vector.body, label %for.cond.cleanup

	for.cond.cleanup: ; preds = %vector.body, %entry
	ret void
	}

	declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
	declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
	declare void @llvm.set.loop.iterations.i32(i32)
	declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
	declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] tail-predication: checks for the elementcount, cont'd
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 294621

llvm/lib/Target/ARM/MVETailPredication.cpp

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] tail-predication: checks for the elementcount, cont'dClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 294621

llvm/lib/Target/ARM/MVETailPredication.cpp

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll

[ARM][MVE] tail-predication: checks for the elementcount, cont'd
ClosedPublic