Diff 228255

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Show First 20 Lines • Show All 994 Lines • ▼ Show 20 Lines	bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
HWLoopInfo.CounterInReg = true;		HWLoopInfo.CounterInReg = true;
HWLoopInfo.IsNestingLegal = false;		HWLoopInfo.IsNestingLegal = false;
HWLoopInfo.PerformEntryTest = true;		HWLoopInfo.PerformEntryTest = true;
HWLoopInfo.CountType = Type::getInt32Ty(C);		HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);		HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;		return true;
}		}

		static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
		// We don't allow icmp's, and because we only look at single block loops,
		samparkerUnsubmitted Done Reply Inline Actions isHardwareLoopProfitable will already catch these and tries quite hard to get it right. isa<> not dyn_cast. The zext and truncs could likely be folded into memory operations, so you're gonna need to check the uses / users. sext would be the same. samparker: - isHardwareLoopProfitable will already catch these and tries quite hard to get it right.
		// we simply count the icmps, i.e. there should only be 1 for the backedge.
		if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
		return false;

		if (isa<TruncInst>(&I) \|\| isa<SExtInst>(&I) \|\| isa<ZExtInst>(&I)) {
		samparkerUnsubmitted Not Done Reply Inline Actions ... check uses/users. samparker: ... check uses/users.
		if (isa<LoadInst>(I.getOperand(0)) \|\| isa<StoreInst>(I.getOperand(0))) {
		samparkerUnsubmitted Not Done Reply Inline Actions You need to check the user of the load, not its operand. Also the logic is wrong here, we need to accept extending loads and truncating stores. samparker: You need to check the user of the load, not its operand. Also the logic is wrong here, we need…
		LLVM_DEBUG(dbgs() << "not allowing instruction: "; I.dump());
		return false;
		}
		}
		return true;
		}
		samparkerUnsubmitted Done Reply Inline Actions If we're checking for casts within the loop, I don't see why this is necessary, because we shouldn't then also have to then check all the types used. samparker: If we're checking for casts within the loop, I don't see why this is necessary, because we…

		// To set up a tail-predicated loop, we need to know the total number of
		samparkerUnsubmitted Not Done Reply Inline Actions SmallVectorImpl<Instruction*> samparker: SmallVectorImpl<Instruction*>
		// elements processed by that loop. Thus, we need to determine the element
		// size and:
		// 1) it should be uniform for all operations in the vector loop, so we
		// e.g. don't want any widening/narrowing operations.
		// 2) it should be smaller than i64s because we don't have vector operations
		// that work on i64s.
		// 3) we don't want elements to be reversed or shuffled, to make sure the
		// tail-predication masks/predicates the right lanes.
		samparkerUnsubmitted Not Done Reply Inline Actions if (Stride != 1) return false samparker: if (Stride != 1) return false
		//
		static bool canTailPredicateLoop(Loop L, LoopInfo LI, ScalarEvolution &SE,
		const DataLayout &DL,
		const LoopAccessInfo *LAI) {
		PredicatedScalarEvolution PSE = LAI->getPSE();
		int ICmpCount = 0;
		int Stride = 0;
		bool FirstStride = true;

		LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
		SmallVector<Instruction *, 16> LoadStores;
		for (BasicBlock *BB : L->blocks()) {
		for (Instruction &I : BB->instructionsWithoutDebug()) {
		if (isa<PHINode>(&I))
		continue;
		if (!canTailPredicateInstruction(I, ICmpCount))
		return false;

		Type *T = I.getType();
		if (T->isPointerTy())
		T = T->getPointerElementType();

		// TODO: the float types
		if (T->isHalfTy() \|\| T->isFloatTy() \|\| T->isDoubleTy())
		samparkerUnsubmitted Not Done Reply Inline Actions What is the complication of supporting half and float now? samparker: What is the complication of supporting half and float now?
		return false;
		samparkerUnsubmitted Not Done Reply Inline Actions isa samparker: isa

		if (T->getScalarSizeInBits() > 32) {
		LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
		return false;
		}

		if (isa<StoreInst>(I) \|\| isa<LoadInst>(I)) {
		Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
		int64_t NextStride = getPtrStride(PSE, Ptr, L);
		// TODO: for now only allow consecutive strides of 1 and -1. We
		// could support other strides as long as it is uniform, but let's
		// keep it simple for now.
		if (FirstStride && (NextStride == 1 \|\| NextStride == -1)) {
		samparkerUnsubmitted Not Done Reply Inline Actions You don't need FirstStride, the uninitialised Stride holds the same info. samparker: You don't need FirstStride, the uninitialised Stride holds the same info.
		FirstStride = false;
		dmgreenUnsubmitted Not Done Reply Inline Actions Can you explain the reasoning for adding -1 strides? I would expect them to become vrev's and vmov's, both of which will be difficult to prove in the backend (if not outright incorrect). Are you expecting any reverse shuffles to be cancelled out? dmgreen: Can you explain the reasoning for adding -1 strides? I would expect them to become vrev's and…
		Stride = NextStride;
		continue;
		}
		if (Stride != NextStride) {
		LLVM_DEBUG(dbgs() << "Different strides found, can't "
		samparkerUnsubmitted Not Done Reply Inline Actions Makes sense to just check the stride here and then we can exit early. samparker: Makes sense to just check the stride here and then we can exit early.
		"tail-predicate\n.");
		return false;
		}
		}
		}
		}

		LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
		return true;
		}

bool ARMTTIImpl::preferPredicateOverEpilogue(Loop L, LoopInfo LI,		bool ARMTTIImpl::preferPredicateOverEpilogue(Loop L, LoopInfo LI,
ScalarEvolution &SE,		ScalarEvolution &SE,
AssumptionCache &AC,		AssumptionCache &AC,
TargetLibraryInfo *TLI,		TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI) {		const LoopAccessInfo *LAI) {
// Creating a predicated vector loop is the first step for generating a		// Creating a predicated vector loop is the first step for generating a
		dmgreenUnsubmitted Not Done Reply Inline Actions Nice one. dmgreen: Nice one.
// tail-predicated hardware loop, for which we need the MVE masked		// tail-predicated hardware loop, for which we need the MVE masked
// load/stores instructions:		// load/stores instructions:
if (!ST->hasMVEIntegerOps())		if (!ST->hasMVEIntegerOps())
return false;		return false;

		// For now, restrict this to single block loops.
		if (L->getNumBlocks() > 1) {
		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
		"loop.\n");
		return false;
		}

		assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");

HardwareLoopInfo HWLoopInfo(L);		HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {		if (!HWLoopInfo.canAnalyze(*LI)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"analyzable.\n");		"analyzable.\n");
return false;		return false;
}		}

// This checks if we have the low-overhead branch architecture		// This checks if we have the low-overhead branch architecture
// extension, and if we will create a hardware-loop:		// extension, and if we will create a hardware-loop:
if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {		if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"profitable.\n");		"profitable.\n");
return false;		return false;
}		}

if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {		if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"a candidate.\n");		"a candidate.\n");
return false;		return false;
}		}

// TODO: to set up a tail-predicated loop, which works by setting up		return canTailPredicateLoop(L, LI, SE, DL, LAI);
// the total number of elements processed by the loop, we need to
// determine the element size here, and if it is uniform for all operations
// in the vector loop. This means we will reject narrowing/widening
// operations, and don't want to predicate the vector loop, which is
// the main prep step for tail-predicated loops.

return false;
}		}


void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {		TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.		// Only currently enable these preferences for M-Class cores.
if (!ST->isMClass())		if (!ST->isMClass())
return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);		return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
▲ Show 20 Lines • Show All 91 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

	Show All 10 Lines
	; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for			; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
	; these cases.			; these cases.
	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

	define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {			define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
	; CHECK-LABEL: tail_folding(			; CHECK-LABEL: prefer_folding(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
	;			;
	; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(			; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
	; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(			; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
				; NO-FOLDING: br i1 %{{.}}, label %{{.}}, label %for.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
				; CHECK-LABEL: mixed_types(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
				; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
				; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0v4i16
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
				%arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
				%0 = load i16, i16* %arrayidx, align 2
				%arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
				%1 = load i16, i16* %arrayidx1, align 2
				%add = add i16 %1, %0
				%arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
				store i16 %add, i16* %arrayidx4, align 2
				%arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
				%2 = load i32, i32* %arrayidx5, align 4
				%arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
				%3 = load i32, i32* %arrayidx6, align 4
				%add7 = add nsw i32 %3, %2
				%arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
				store i32 %add7, i32* %arrayidx8, align 4
				%add9 = add nuw nsw i32 %i.018, 1
				%exitcond = icmp eq i32 %add9, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: unsupported_i64_type(
				; PREFER-FOLDING-NOT: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: for.body:
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
				%0 = load i64, i64* %arrayidx, align 8
				%arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
				%1 = load i64, i64* %arrayidx1, align 8
				%add = add nsw i64 %1, %0
				%arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
				store i64 %add, i64* %arrayidx2, align 8
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @zext_not_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: zext_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
				%0 = load i8, i8* %arrayidx, align 1
				%conv = zext i8 %0 to i32
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %conv
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @sext_not_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: sext_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
				%0 = load i8, i8* %arrayidx, align 1
				%conv = sext i8 %0 to i32
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %conv
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @trunc_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: trunc_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup: ; preds = %for.body
				ret void

				for.body: ; preds = %for.body, %entry
				%i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
				%arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
				%0 = load i16, i16* %arrayidx, align 2
				%arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
				%1 = load i8, i8* %arrayidx1, align 1
				%conv3 = trunc i16 %0 to i8
				%add = add i8 %1, %conv3
				%arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
				store i8 %add, i8* %arrayidx5, align 1
				%add6 = add nuw nsw i32 %i.012, 1
				%exitcond = icmp eq i32 %add6, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; Here we have a trunc, but it's not for a load/store, so this
				; should be allowed.
				define void @trunc_is_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: trunc_is_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1

				%add.iv = trunc i32 %add3 to i16

				%exitcond = icmp eq i16 %add.iv, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}


				@tab = common global [32 x i8] zeroinitializer, align 1

				define i32 @icmp_not_allowed() #0 {
				; CHECK-LABEL: icmp_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.body:
				%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
				%0 = load i8, i8* %arrayidx, align 1
				%cmp1 = icmp eq i8 %0, 0
				%. = select i1 %cmp1, i8 2, i8 1
				store i8 %., i8* %arrayidx, align 1
				%inc = add nsw i32 %i.08, 1
				%exitcond = icmp slt i32 %inc, 1000
				br i1 %exitcond, label %for.body, label %for.end

				for.end:
				ret i32 0
				}

				define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: pragma_vect_predicate_disable(
	;			;
	; TODO: this needs implementation of TTI::preferPredicateOverEpilogue,			; FIXME:
	; then this will be tail-folded too:			; respect loop hint predicate.enable = false, and don't tail-fold here:
	;
	; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
	; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
	;			;
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
				}

				; Test directions for array indices i and N-1. I.e. check strides 1 and -1, and
				; force vectorisation with a loop hint.
				define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
				; CHECK-LABEL: strides_different_direction(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%sub = sub nsw i32 %N, %i.09
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
				}

				define void @strides_minus_one(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
				; CHECK-LABEL: strides_minus_one(
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
	entry:			entry:
	br label %for.body			br label %for.body

	for.cond.cleanup:			for.cond.cleanup:
	ret void			ret void

	for.body:			for.body:
	%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]			%i.013 = phi i32 [ 0, %entry ], [ %add5, %for.body ]
	%arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv			%sub = sub nsw i32 %N, %i.013
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %sub
	%0 = load i32, i32* %arrayidx, align 4			%0 = load i32, i32* %arrayidx, align 4
	%arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv			%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %sub
	%1 = load i32, i32* %arrayidx2, align 4			%1 = load i32, i32* %arrayidx2, align 4
	%add = add nsw i32 %1, %0			%add = add nsw i32 %1, %0
	%arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv			%arrayidx4 = getelementptr inbounds i32, i32* %A, i32 %sub
	store i32 %add, i32* %arrayidx4, align 4			store i32 %add, i32* %arrayidx4, align 4
	%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1			%add5 = add nuw nsw i32 %i.013, 1
	%exitcond = icmp eq i64 %indvars.iv.next, 430			%exitcond = icmp eq i32 %add5, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
				}


				define dso_local void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
				; CHECK-LABEL: stride_4(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 4
				%cmp = icmp ult i32 %add3, 731
				br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
				}

				define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: too_many_loop_blocks(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				br label %loopincr

				loopincr:
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
	br i1 %exitcond, label %for.cond.cleanup, label %for.body			br i1 %exitcond, label %for.cond.cleanup, label %for.body
	}			}

				; TODO: the float types
				define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: float(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
				%0 = load float, float* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
				%1 = load float, float* %arrayidx1, align 4
				%add = fadd fast float %1, %0
				%arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
				store float %add, float* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
				}

				attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

				!5 = distinct !{!5, !6}
				!6 = !{!"llvm.loop.vectorize.enable", i1 true}

				!7 = distinct !{!7, !8}
				!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

				!10 = distinct !{!10, !11}
				!11 = !{!"llvm.loop.vectorize.width", i32 4}

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] canTailPredicateLoop
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 228255

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] canTailPredicateLoop
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 228255

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

[ARM][MVE] canTailPredicateLoop
ClosedPublic