Diff 229067

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Show All 38 Lines
static cl::opt<bool> EnableMaskedLoadStores(		static cl::opt<bool> EnableMaskedLoadStores(
"enable-arm-maskedldst", cl::Hidden, cl::init(false),		"enable-arm-maskedldst", cl::Hidden, cl::init(false),
cl::desc("Enable the generation of masked loads and stores"));		cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(		static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),		"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));		cl::desc("Disable the generation of low-overhead loops"));

		extern cl::opt<bool> DisableTailPredication;

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,		bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {		const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();		const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =		const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();		TM.getSubtargetImpl(*Caller)->getFeatureBits();
const FeatureBitset &CalleeBits =		const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();		TM.getSubtargetImpl(*Callee)->getFeatureBits();

▲ Show 20 Lines • Show All 940 Lines • ▼ Show 20 Lines	bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
HWLoopInfo.CounterInReg = true;		HWLoopInfo.CounterInReg = true;
HWLoopInfo.IsNestingLegal = false;		HWLoopInfo.IsNestingLegal = false;
HWLoopInfo.PerformEntryTest = true;		HWLoopInfo.PerformEntryTest = true;
HWLoopInfo.CountType = Type::getInt32Ty(C);		HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);		HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;		return true;
}		}

		static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
		// We don't allow icmp's, and because we only look at single block loops,
		samparkerUnsubmitted Done Reply Inline Actions isHardwareLoopProfitable will already catch these and tries quite hard to get it right. Use isa<>, not dyn_cast. The zext and truncs could likely be folded into memory operations, so you're going to need to check the uses / users. sext would be the same. samparker: - isHardwareLoopProfitable will already catch these and tries quite hard to get it right.
		// we simply count the icmps, i.e. there should only be 1 for the backedge.
		if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
		return false;

		// We could allow extending/narrowing FP loads/stores, but codegen is
		samparkerUnsubmitted Not Done Reply Inline Actions ... check uses/users. samparker: ... check uses/users.
		// too inefficient so reject this for now.
		samparkerUnsubmitted Not Done Reply Inline Actions You need to check the user of the load, not its operand. Also, the logic is wrong here: we need to accept extending loads and truncating stores. samparker: You need to check the user of the load, not its operand. Also, the logic is wrong here: we need…
		if (isa<FPExtInst>(&I) \|\| isa<FPTruncInst>(&I))
		return false;

		// Extends have to be extending-loads
		if (isa<SExtInst>(&I) \|\| isa<ZExtInst>(&I) )
		if (!I.getOperand(0)->hasOneUse() \|\| !isa<LoadInst>(I.getOperand(0)))
		samparkerUnsubmitted Done Reply Inline Actions If we're checking for casts within the loop, I don't see why this is necessary, because we shouldn't then also have to check all the types used. samparker: If we're checking for casts within the loop, I don't see why this is necessary, because we…
		return false;

		samparkerUnsubmitted Not Done Reply Inline Actions SmallVectorImpl<Instruction*> samparker: SmallVectorImpl<Instruction*>
		// Truncs have to be narrowing-stores
		if (isa<TruncInst>(&I) )
		if (!I.hasOneUse() \|\| !isa<StoreInst>(*I.user_begin()))
		return false;

		return true;
		}

		samparkerUnsubmitted Not Done Reply Inline Actions if (Stride != 1) return false samparker: if (Stride != 1) return false
		// To set up a tail-predicated loop, we need to know the total number of
		// elements processed by that loop. Thus, we need to determine the element
		// size and:
		// 1) it should be uniform for all operations in the vector loop, so we
		// e.g. don't want any widening/narrowing operations.
		// 2) it should be smaller than i64s because we don't have vector operations
		// that work on i64s.
		// 3) we don't want elements to be reversed or shuffled, to make sure the
		// tail-predication masks/predicates the right lanes.
		//
		static bool canTailPredicateLoop(Loop L, LoopInfo LI, ScalarEvolution &SE,
		const DataLayout &DL,
		const LoopAccessInfo *LAI) {
		PredicatedScalarEvolution PSE = LAI->getPSE();
		int ICmpCount = 0;
		int Stride = 0;

		LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
		SmallVector<Instruction *, 16> LoadStores;
		for (BasicBlock *BB : L->blocks()) {
		for (Instruction &I : BB->instructionsWithoutDebug()) {
		if (isa<PHINode>(&I))
		continue;
		if (!canTailPredicateInstruction(I, ICmpCount)) {
		samparkerUnsubmitted Not Done Reply Inline Actions What is the complication of supporting half and float now? samparker: What is the complication of supporting half and float now?
		LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
		samparkerUnsubmitted Not Done Reply Inline Actions isa samparker: isa
		return false;
		}

		Type *T = I.getType();
		if (T->isPointerTy())
		T = T->getPointerElementType();

		if (T->getScalarSizeInBits() > 32) {
		LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
		return false;
		}

		if (isa<StoreInst>(I) \|\| isa<LoadInst>(I)) {
		samparkerUnsubmitted Not Done Reply Inline Actions You don't need FirstStride, the uninitialised Stride holds the same info. samparker: You don't need FirstStride, the uninitialised Stride holds the same info.
		Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
		dmgreenUnsubmitted Not Done Reply Inline Actions Can you explain the reasoning for adding -1 strides? I would expect them to become vrev's and vmov's, both of which will be difficult to prove in the backend (if not outright incorrect). Are you expecting any reverse shuffles to be cancelled out? dmgreen: Can you explain the reasoning for adding -1 strides? I would expect them to become vrev's and…
		int64_t NextStride = getPtrStride(PSE, Ptr, L);
		// TODO: for now only allow consecutive strides of 1. We could support
		// other strides as long as it is uniform, but let's keep it simple for
		// now.
		if (Stride == 0 && NextStride == 1) {
		samparkerUnsubmitted Not Done Reply Inline Actions Makes sense to just check the stride here and then we can exit early. samparker: Makes sense to just check the stride here and then we can exit early.
		Stride = NextStride;
		continue;
		}
		if (Stride != NextStride) {
		LLVM_DEBUG(dbgs() << "Different strides found, can't "
		"tail-predicate\n.");
		return false;
		}
		}
		}
		}

		LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
		return true;
		}

bool ARMTTIImpl::preferPredicateOverEpilogue(Loop L, LoopInfo LI,		bool ARMTTIImpl::preferPredicateOverEpilogue(Loop L, LoopInfo LI,
ScalarEvolution &SE,		ScalarEvolution &SE,
AssumptionCache &AC,		AssumptionCache &AC,
TargetLibraryInfo *TLI,		TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI) {		const LoopAccessInfo *LAI) {
		if (DisableTailPredication)
		dmgreenUnsubmitted Not Done Reply Inline Actions Nice one. dmgreen: Nice one.
		return false;

// Creating a predicated vector loop is the first step for generating a		// Creating a predicated vector loop is the first step for generating a
// tail-predicated hardware loop, for which we need the MVE masked		// tail-predicated hardware loop, for which we need the MVE masked
// load/stores instructions:		// load/stores instructions:
if (!ST->hasMVEIntegerOps())		if (!ST->hasMVEIntegerOps())
return false;		return false;

		// For now, restrict this to single block loops.
		if (L->getNumBlocks() > 1) {
		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
		"loop.\n");
		return false;
		}

		assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");

HardwareLoopInfo HWLoopInfo(L);		HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {		if (!HWLoopInfo.canAnalyze(*LI)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"analyzable.\n");		"analyzable.\n");
return false;		return false;
}		}

// This checks if we have the low-overhead branch architecture		// This checks if we have the low-overhead branch architecture
// extension, and if we will create a hardware-loop:		// extension, and if we will create a hardware-loop:
if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {		if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"profitable.\n");		"profitable.\n");
return false;		return false;
}		}

if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {		if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"a candidate.\n");		"a candidate.\n");
return false;		return false;
}		}

// TODO: to set up a tail-predicated loop, which works by setting up		return canTailPredicateLoop(L, LI, SE, DL, LAI);
// the total number of elements processed by the loop, we need to
// determine the element size here, and if it is uniform for all operations
// in the vector loop. This means we will reject narrowing/widening
// operations, and don't want to predicate the vector loop, which is
// the main prep step for tail-predicated loops.

return false;
}		}


void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {		TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.		// Only currently enable these preferences for M-Class cores.
if (!ST->isMClass())		if (!ST->isMClass())
return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);		return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
▲ Show 20 Lines • Show All 91 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/MVETailPredication.cpp

	Show All 35 Lines
	#include "ARM.h"			#include "ARM.h"
	#include "ARMSubtarget.h"			#include "ARMSubtarget.h"

	using namespace llvm;			using namespace llvm;

	#define DEBUG_TYPE "mve-tail-predication"			#define DEBUG_TYPE "mve-tail-predication"
	#define DESC "Transform predicated vector loops to use MVE tail predication"			#define DESC "Transform predicated vector loops to use MVE tail predication"

	static cl::opt<bool>			cl::opt<bool>
	DisableTailPredication("disable-mve-tail-predication", cl::Hidden,			DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
	cl::init(true),			cl::init(true),
	cl::desc("Disable MVE Tail Predication"));			cl::desc("Disable MVE Tail Predication"));
	namespace {			namespace {

	class MVETailPredication : public LoopPass {			class MVETailPredication : public LoopPass {
	SmallVector<IntrinsicInst*, 4> MaskedInsts;			SmallVector<IntrinsicInst*, 4> MaskedInsts;
	Loop *L = nullptr;			Loop *L = nullptr;
	▲ Show 20 Lines • Show All 467 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf < %s -loop-vectorize -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
				; RUN: -disable-mve-tail-predication=false -loop-vectorize -S < %s \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
				; RUN: -disable-mve-tail-predication=false -loop-vectorize \
				; RUN: -enable-arm-maskedldst=true -S < %s \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=false -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
				; RUN: -disable-mve-tail-predication=false -loop-vectorize \
				; RUN: -enable-arm-maskedldst=false -S < %s \| \
				; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

				; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
				; RUN: -disable-mve-tail-predication=true -loop-vectorize \
				; RUN: -enable-arm-maskedldst=true -S < %s \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

	; Disabling the low-overhead branch extension will make			; Disabling the low-overhead branch extension will make
	; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for			; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
	; these cases.			; these cases.
	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
				; RUN: -disable-mve-tail-predication=false -loop-vectorize \
				; RUN: -enable-arm-maskedldst=true -S < %s \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING

	; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=true -S \| \			; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
				; RUN: -disable-mve-tail-predication=false -loop-vectorize \
				; RUN: -enable-arm-maskedldst=true -S < %s \| \
	; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING			; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

	define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {			define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
	; CHECK-LABEL: tail_folding(			; CHECK-LABEL: prefer_folding(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
	;			;
	; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(			; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
	; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(			; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
				; NO-FOLDING: br i1 %{{.}}, label %{{.}}, label %for.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
				; CHECK-LABEL: mixed_types(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
				; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
				; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0v4i16
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
				%arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
				%0 = load i16, i16* %arrayidx, align 2
				%arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
				%1 = load i16, i16* %arrayidx1, align 2
				%add = add i16 %1, %0
				%arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
				store i16 %add, i16* %arrayidx4, align 2
				%arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
				%2 = load i32, i32* %arrayidx5, align 4
				%arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
				%3 = load i32, i32* %arrayidx6, align 4
				%add7 = add nsw i32 %3, %2
				%arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
				store i32 %add7, i32* %arrayidx8, align 4
				%add9 = add nuw nsw i32 %i.018, 1
				%exitcond = icmp eq i32 %add9, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: unsupported_i64_type(
				; PREFER-FOLDING-NOT: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: for.body:
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
				%0 = load i64, i64* %arrayidx, align 8
				%arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
				%1 = load i64, i64* %arrayidx1, align 8
				%add = add nsw i64 %1, %0
				%arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
				store i64 %add, i64* %arrayidx2, align 8
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: zero_extending_load_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
				%0 = load i8, i8* %arrayidx, align 1
				%conv = zext i8 %0 to i32
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %conv
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: sign_extending_load_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
				%0 = load i8, i8* %arrayidx, align 1
				%conv = sext i8 %0 to i32
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %conv
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: narrowing_load_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup: ; preds = %for.body
				ret void

				for.body: ; preds = %for.body, %entry
				%i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
				%arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
				%0 = load i16, i16* %arrayidx, align 2
				%arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
				%1 = load i8, i8* %arrayidx1, align 1
				%conv3 = trunc i16 %0 to i8
				%add = add i8 %1, %conv3
				%arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
				store i8 %add, i8* %arrayidx5, align 1
				%add6 = add nuw nsw i32 %i.012, 1
				%exitcond = icmp eq i32 %add6, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: narrowing_store_allowed(
				; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%conv = trunc i32 %add to i8
				%arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
				store i8 %conv, i8* %arrayidx2, align 1
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; This is a trunc not connected to a store, so we don't allow this.
				; TODO: this is conservative, because the trunc is only used in the
				; loop control statements, and thus not affecting element sizes, so
				; we could allow this case.
				define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: trunc_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1

				%add.iv = trunc i32 %add3 to i16

				%exitcond = icmp eq i16 %add.iv, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 {
				; CHECK-LABEL: trunc_not_allowed_different_vec_elemns(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021
				store i32 %add, i32* %arrayidx2, align 4
				%add.tr = trunc i32 %add to i16
				%conv7 = shl i16 %add.tr, 1
				%arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021
				store i16 %conv7, i16* %arrayidx8, align 2
				%add9 = add nuw nsw i32 %i.021, 1
				%exitcond = icmp eq i32 %add9, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}


				@tab = common global [32 x i8] zeroinitializer, align 1

				define i32 @icmp_not_allowed() #0 {
				; CHECK-LABEL: icmp_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.body:
				%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
				%0 = load i8, i8* %arrayidx, align 1
				%cmp1 = icmp eq i8 %0, 0
				%. = select i1 %cmp1, i8 2, i8 1
				store i8 %., i8* %arrayidx, align 1
				%inc = add nsw i32 %i.08, 1
				%exitcond = icmp slt i32 %inc, 1000
				br i1 %exitcond, label %for.body, label %for.end

				for.end:
				ret i32 0
				}

				define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: pragma_vect_predicate_disable(
	;			;
	; TODO: this needs implementation of TTI::preferPredicateOverEpilogue,			; FIXME:
	; then this will be tail-folded too:			; respect loop hint predicate.enable = false, and don't tail-fold here:
	;
	; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
	; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
	;			;
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
				; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
				}

				; Test directions for array indices i and N-1. I.e. check strides 1 and -1, and
				; force vectorisation with a loop hint.
				define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
				; CHECK-LABEL: strides_different_direction(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%sub = sub nsw i32 %N, %i.09
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
				}

				define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: stride_4(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
				%0 = load i32, i32* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
				%1 = load i32, i32* %arrayidx1, align 4
				%add = add nsw i32 %1, %0
				%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
				store i32 %add, i32* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 4
				%cmp = icmp ult i32 %add3, 731
				br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
				}

				define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: too_many_loop_blocks(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.}}, label %{{.}}, label %vector.body
	entry:			entry:
	br label %for.body			br label %for.body

	for.cond.cleanup:			for.cond.cleanup:
	ret void			ret void

	for.body:			for.body:
	%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]			%i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
	%arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv			%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
	%0 = load i32, i32* %arrayidx, align 4			%0 = load i32, i32* %arrayidx, align 4
	%arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv			%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
	%1 = load i32, i32* %arrayidx2, align 4			%1 = load i32, i32* %arrayidx1, align 4
	%add = add nsw i32 %1, %0			%add = add nsw i32 %1, %0
	%arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv			%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
	store i32 %add, i32* %arrayidx4, align 4			store i32 %add, i32* %arrayidx2, align 4
	%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1			br label %loopincr
	%exitcond = icmp eq i64 %indvars.iv.next, 430
				loopincr:
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
	br i1 %exitcond, label %for.cond.cleanup, label %for.body			br i1 %exitcond, label %for.cond.cleanup, label %for.body
	}			}

				; f16 element-wise add, A[i] = B[i] + C[i], for i = 0..430 (431 iterations,
				; which is not a multiple of 8). The PREFER-FOLDING checks expect the
				; vector body to use two <8 x half> masked loads and one <8 x half> masked
				; store, i.e. predicated memory accesses are generated for this loop.
				define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: half(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
				; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
				; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
				; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
				%0 = load half, half* %arrayidx, align 2
				%arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
				%1 = load half, half* %arrayidx1, align 2
				%add = fadd fast half %1, %0
				%arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
				store half %add, half* %arrayidx2, align 2
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; f32 element-wise add, A[i] = B[i] + C[i], for i = 0..430 (431 iterations,
				; which is not a multiple of 4). The loop branch carries !llvm.loop !10,
				; which requests a vectorization width of 4. The PREFER-FOLDING checks
				; expect two <4 x float> masked loads and one <4 x float> masked store in
				; the vector body.
				define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: float(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
				; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
				; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
				; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
				%0 = load float, float* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
				%1 = load float, float* %arrayidx1, align 4
				%add = fadd fast float %1, %0
				%arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
				store float %add, float* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
				}

				; f64 element-wise add, A[i] = B[i] + C[i], for i = 0..430 (431 iterations).
				; The PREFER-FOLDING checks expect the original scalar for.body block to
				; survive and no vector.body block to appear after it, i.e. this loop is
				; left unvectorized.
				define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: double(
				; PREFER-FOLDING: for.body:
				; PREFER-FOLDING-NOT: vector.body:
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds double, double* %B, i32 %i.09
				%0 = load double, double* %arrayidx, align 8
				%arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09
				%1 = load double, double* %arrayidx1, align 8
				%add = fadd fast double %1, %0
				%arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09
				store double %add, double* %arrayidx2, align 8
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
				; so reject this for now.
				;
				; Mixed-width loop: a half value loaded from B is extended to float, added
				; to a float loaded from C, and stored to A, for i = 0..430. Despite the
				; function name, the PREFER-FOLDING checks currently expect a vector.body
				; with no masked loads or stores (see the TODO above).
				define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: fpext_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
				%0 = load half, half* %arrayidx, align 2
				%conv = fpext half %0 to float
				%arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
				%1 = load float, float* %arrayidx1, align 4
				%add = fadd fast float %1, %conv
				%arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
				store float %add, float* %arrayidx2, align 4
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
				; so reject this for now.
				;
				; Mixed-width loop: two floats loaded from B and C are added, the sum is
				; truncated to half, and the half is stored to A, for i = 0..430. The
				; truncated value is only stored. Despite the function name, the
				; PREFER-FOLDING checks currently expect a vector.body with no masked
				; loads or stores (see the TODO above).
				define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
				; CHECK-LABEL: fptrunc_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
				%arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
				%0 = load float, float* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
				%1 = load float, float* %arrayidx1, align 4
				%add = fadd fast float %1, %0
				%conv = fptrunc float %add to half
				%arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
				store half %conv, half* %arrayidx2, align 2
				%add3 = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %add3, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; Mixed-width loop where the truncated value feeds further arithmetic: the
				; float sum of B[i] and C[i] is stored to A, then truncated to half,
				; multiplied by the half constant 0xH4000, and stored to D, for i = 0..430.
				; The PREFER-FOLDING checks expect a vector.body with no masked loads or
				; stores in it.
				define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 {
				; CHECK-LABEL: fptrunc_not_allowed(
				; PREFER-FOLDING: vector.body:
				; PREFER-FOLDING-NOT: llvm.masked.load
				; PREFER-FOLDING-NOT: llvm.masked.store
				; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
				%arrayidx = getelementptr inbounds float, float* %B, i32 %i.017
				%0 = load float, float* %arrayidx, align 4
				%arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017
				%1 = load float, float* %arrayidx1, align 4
				%add = fadd fast float %1, %0
				%arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017
				store float %add, float* %arrayidx2, align 4
				%conv = fptrunc float %add to half
				%factor = fmul fast half %conv, 0xH4000
				%arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017
				store half %factor, half* %arrayidx5, align 2
				%add6 = add nuw nsw i32 %i.017, 1
				%exitcond = icmp eq i32 %add6, 431
				br i1 %exitcond, label %for.cond.cleanup, label %for.body
				}

				; Attribute group #0, referenced by the function definitions in this file:
				; selects the armv8.1-m.main and mve.fp target features.
				attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

				; Loop metadata: sets llvm.loop.vectorize.enable to true.
				!5 = distinct !{!5, !6}
				!6 = !{!"llvm.loop.vectorize.enable", i1 true}

				; Loop metadata: sets llvm.loop.vectorize.predicate.enable to false.
				; (Its user is not among the functions immediately above; presumably an
				; earlier function in this file - verify.)
				!7 = distinct !{!7, !8}
				!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

				; Loop metadata: requests a vectorization width of 4; attached to the loop
				; branch in @float.
				!10 = distinct !{!10, !11}
				!11 = !{!"llvm.loop.vectorize.width", i32 4}

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] canTailPredicateLoop
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 229067

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/lib/Target/ARM/MVETailPredication.cpp

llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] canTailPredicateLoop
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 229067

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/lib/Target/ARM/MVETailPredication.cpp

llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

[ARM][MVE] canTailPredicateLoop
ClosedPublic