Diff 109067

lib/Target/ARM/ARM.td

	Show First 20 Lines • Show All 92 Lines • ▼ Show 20 Lines
	def : ProcNoItin<"xscale", [ARMv5te]>;			def : ProcNoItin<"xscale", [ARMv5te]>;
	def : ProcNoItin<"iwmmxt", [ARMv5te]>;			def : ProcNoItin<"iwmmxt", [ARMv5te]>;

	def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>;			def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>;
	def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6,			def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6,
	FeatureVFP2,			FeatureVFP2,
	FeatureHasSlowFPVMLx]>;			FeatureHasSlowFPVMLx]>;

	def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>;			def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m,
	def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>;			FeatureHasNoBranchPredictor]>;
				def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m,
				FeatureHasNoBranchPredictor]>;
	def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>;			def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>;
	def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>;			def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>;

	def : Processor<"arm1176j-s", ARMV6Itineraries, [ARMv6kz]>;			def : Processor<"arm1176j-s", ARMV6Itineraries, [ARMv6kz]>;
	def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>;			def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>;
	def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz,			def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz,
	FeatureVFP2,			FeatureVFP2,
	FeatureHasSlowFPVMLx]>;			FeatureHasSlowFPVMLx]>;
	▲ Show 20 Lines • Show All 168 Lines • ▼ Show 20 Lines
	FeatureD16,			FeatureD16,
	FeatureHasNoBranchPredictor]>;			FeatureHasNoBranchPredictor]>;

	def : ProcNoItin<"cortex-m7", [ARMv7em,			def : ProcNoItin<"cortex-m7", [ARMv7em,
	FeatureFPARMv8,			FeatureFPARMv8,
	FeatureD16]>;			FeatureD16]>;

	def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,			def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
	FeatureNoMovt]>;			FeatureNoMovt,
				FeatureHasNoBranchPredictor]>;

	def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,			def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
	FeatureDSP,			FeatureDSP,
	FeatureFPARMv8,			FeatureFPARMv8,
	FeatureD16,			FeatureD16,
	FeatureVFPOnlySP,			FeatureVFPOnlySP,
	FeatureHasNoBranchPredictor]>;			FeatureHasNoBranchPredictor]>;

	▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

lib/Target/ARM/ARMTargetTransformInfo.cpp

	Show First 20 Lines • Show All 92 Lines • ▼ Show 20 Lines

	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	Alignment, AddressSpace);	Alignment, AddressSpace);
	}	}

	void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,	void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
	TTI::UnrollingPreferences &UP) {	TTI::UnrollingPreferences &UP) {
	// Only currently enable these preferences for M-Class cores.	// Only currently enable these preferences for M-Class cores.
	if (!ST->isMClass() \|\| L->getNumBlocks() != 1)	if (!ST->isMClass())
	return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);	return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);

		// Only enable for simple loops.
		if (L->getNumBlocks() != 1)
		return;

	// Disable loop unrolling for Oz and Os.	// Disable loop unrolling for Oz and Os.
	UP.OptSizeThreshold = 0;	UP.OptSizeThreshold = 0;
	UP.PartialOptSizeThreshold = 0;	UP.PartialOptSizeThreshold = 0;
		BasicBlock *BB = L->getLoopLatch();
		if (BB->getParent()->optForSize())
		return;

	// Scan the loop: don't unroll loops with calls as this could prevent	// Scan the loop: don't unroll loops with calls as this could prevent
	// inlining.	// inlining.
	BasicBlock *BB = L->getLoopLatch();	unsigned numInsts = 0;
	for (auto &I : *BB) {	for (auto &I : *BB) {
	if (isa<CallInst>(I) \|\| isa<InvokeInst>(I)) {	if (isa<CallInst>(I) \|\| isa<InvokeInst>(I)) {
	ImmutableCallSite CS(&I);	ImmutableCallSite CS(&I);
	if (const Function *F = CS.getCalledFunction()) {	if (const Function *F = CS.getCalledFunction()) {
	if (!isLoweredToCall(F))	if (!isLoweredToCall(F))
	continue;	continue;
	}	}
	return;	return;
	}	} else if (isa<CastInst>(I) \|\| isa<GetElementPtrInst>(I) \|\|
		isa<BranchInst>(I))
		continue;
		efriedmaUnsubmitted Not Done Reply Inline Actions getUserCost()? efriedma: getUserCost()?
		samparkerAuthorUnsubmitted Not Done Reply Inline Actions This has crossed my mind, I'll look into it. samparker: This has crossed my mind, I'll look into it.
		++numInsts;
	}	}

	// Enable partial and runtime unrolling, set the initial threshold based upon	// Enable partial and runtime unrolling, set the initial threshold based upon
	// the number of registers available.	// the number of registers available.
	UP.Partial = true;	UP.Partial = true;
	UP.Runtime = true;	UP.Runtime = true;
	UP.Threshold = ST->isThumb1Only() ? 75 : 150;	UP.Threshold = ST->isThumb2() ? 150 : 75;
	UP.PartialThreshold = ST->isThumb1Only() ? 75 : 150;	UP.PartialThreshold = ST->isThumb2() ? 150 : 75;
		// Runtime unrolling introduces code to calculate the iterations and so
		// for small trip counts, large unrolling factors will slow the execution.
		// The extra code for an unroll of 2 is less of an overhead and the
		// unrolled body is more likely to be executed. The overhead on a device with
		// a branch predictor is less, so risk a higher unroll count.
		UP.DefaultUnrollRuntimeCount = ST->hasBranchPredictor() ? 4 : 2;

		if (numInsts < 16) {
		// Force unrolling small loops can be very useful because of the branch
		// taken cost of the backedge.
		UP.Force = true;
		// For loops that SCEV doesn't understand, the unrolled loop will have
		// early exits and/or predicated instructions. The overhead of unrolling
		// by an unhelpful factor is therefore less, so increase the count on
		// small loops.
		const SCEV *Count = SE.getExitCount(L, BB);
		efriedmaUnsubmitted Not Done Reply Inline Actions This seems like you're running into some sort of limitation of unrolling infrastructure. Maybe we need to add a feature to unroll remainder loops? Also, which function in the test covers this codepath? efriedma: This seems like you're running into some sort of limitation of unrolling infrastructure. Maybe…
		samparkerAuthorUnsubmitted Not Done Reply Inline Actions The integer type check isn't actually tested and wasn't something that I was interested in, so I will remove it. I'm not sure I understand what you mean. For clarity and posterity, the type check on the SCEV is not querying the number of iterations but whether the expression is based on int, float, etc... values. Currently, there is a TODO in the unroller to handle counts with pointer types. The runtime unroller creates an unrolled body and just uses an if-else statement to execute the correct loop, but the original loop is also called after the unrolled loop for the remaining iterations (N % unroll_count). The runtime unroller, by default, will only unroll the loops for which SCEV can produce a trip count because it can guarantee that the basic block can be duplicated and merged. Otherwise, the body can be duplicated but the basic blocks cannot be merged. The iterate_inc function is what tests this and hopefully highlights the problem that the loop count is dependent on the length of the linked list and SCEV cannot be expected to be able to express this. samparker: The integer type check isn't actually tested and wasn't something that I was interested in, so…
		if (isa<SCEVCouldNotCompute>(Count) \|\| !Count->getType()->isIntegerTy())
		UP.DefaultUnrollRuntimeCount = 4;
		efriedmaUnsubmitted Not Done Reply Inline Actions Have you experimented with setting "UP.Runtime = false"? efriedma: Have you experimented with setting "UP.Runtime = false"?
		samparkerAuthorUnsubmitted Not Done Reply Inline Actions Yes, but there's a lot of performance to be gained by enabling runtime. This patch is trying to get some of that improvement, while minimising the negative effect it can have. samparker: Yes, but there's a lot of performance to be gained by enabling runtime. This patch is trying to…
		}
	}	}
Context not available.

test/Transforms/LoopUnroll/ARM/loop-unrolling.ll

	; RUN: opt -mtriple=armv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-V7			; RUN: opt -mtriple=armv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-A
	; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-V7			; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-A
	; RUN: opt -mtriple=thumbv8m -mcpu=cortex-m23 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-SMALL			; RUN: opt -mtriple=thumbv8m -mcpu=cortex-m23 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-T1
	; RUN: opt -mtriple=thumbv7m -mcpu=cortex-m4 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL			; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-T2
	; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL			; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m7 -loop-unroll -S %s -o - \| FileCheck %s --check-prefix=CHECK-UNROLL-T2-BP

	; CHECK-LABEL: partial			; CHECK-LABEL: partial
	define arm_aapcs_vfpcc void @partial(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B) local_unnamed_addr #0 {			define arm_aapcs_vfpcc void @partial(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
	entry:			entry:
	br label %for.body			br label %for.body

	; CHECK-LABEL: for.body			; CHECK-LABEL: for.body
	for.body:			for.body:

	; CHECK-UNROLL-V7: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV2:%[a-z.0-9]+]], %for.body ]			; CHECK-UNROLL-A: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV2:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-V7: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-A: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-V7: [[IV2]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-A: [[IV2]] = add nuw nsw i32 [[IV1]], 1
	; CHECK-UNROLL-V7: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV2]], 1024			; CHECK-UNROLL-A: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV2]], 1024
	; CHECK-UNROLL-V7: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body			; CHECK-UNROLL-A: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body

	; CHECK-UNROLL-SMALL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV8:%[a-z.0-9]+]], %for.body ]			; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV8:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-SMALL: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-T1: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-SMALL: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-T1: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
	; CHECK-UNROLL-SMALL: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1			; CHECK-UNROLL-T1: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
	; CHECK-UNROLL-SMALL: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1			; CHECK-UNROLL-T1: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1
	; CHECK-UNROLL-SMALL: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1			; CHECK-UNROLL-T1: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
	; CHECK-UNROLL-SMALL: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1			; CHECK-UNROLL-T1: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1
	; CHECK-UNROLL-SMALL: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1			; CHECK-UNROLL-T1: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
	; CHECK-UNROLL-SMALL: [[IV8]] = add nuw nsw i32 [[IV7]], 1			; CHECK-UNROLL-T1: [[IV8]] = add nuw nsw i32 [[IV7]], 1
	; CHECK-UNROLL-SMALL: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV8]], 1024			; CHECK-UNROLL-T1: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV8]], 1024
	; CHECK-UNROLL-SMALL: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body			; CHECK-UNROLL-T1: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body

	; CHECK-UNROLL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV16:%[a-z.0-9]+]], %for.body ]			; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV16:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-T2: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-T2: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
	; CHECK-UNROLL: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1			; CHECK-UNROLL-T2: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
	; CHECK-UNROLL: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1			; CHECK-UNROLL-T2: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1
	; CHECK-UNROLL: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1			; CHECK-UNROLL-T2: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
	; CHECK-UNROLL: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1			; CHECK-UNROLL-T2: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1
	; CHECK-UNROLL: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1			; CHECK-UNROLL-T2: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
	; CHECK-UNROLL: [[IV8:%[a-z.0-9]+]] = add nuw nsw i32 [[IV7]], 1			; CHECK-UNROLL-T2: [[IV8:%[a-z.0-9]+]] = add nuw nsw i32 [[IV7]], 1
	; CHECK-UNROLL: [[IV9:%[a-z.0-9]+]] = add nuw nsw i32 [[IV8]], 1			; CHECK-UNROLL-T2: [[IV9:%[a-z.0-9]+]] = add nuw nsw i32 [[IV8]], 1
	; CHECK-UNROLL: [[IV10:%[a-z.0-9]+]] = add nuw nsw i32 [[IV9]], 1			; CHECK-UNROLL-T2: [[IV10:%[a-z.0-9]+]] = add nuw nsw i32 [[IV9]], 1
	; CHECK-UNROLL: [[IV11:%[a-z.0-9]+]] = add nuw nsw i32 [[IV10]], 1			; CHECK-UNROLL-T2: [[IV11:%[a-z.0-9]+]] = add nuw nsw i32 [[IV10]], 1
	; CHECK-UNROLL: [[IV12:%[a-z.0-9]+]] = add nuw nsw i32 [[IV11]], 1			; CHECK-UNROLL-T2: [[IV12:%[a-z.0-9]+]] = add nuw nsw i32 [[IV11]], 1
	; CHECK-UNROLL: [[IV13:%[a-z.0-9]+]] = add nuw nsw i32 [[IV12]], 1			; CHECK-UNROLL-T2: [[IV13:%[a-z.0-9]+]] = add nuw nsw i32 [[IV12]], 1
	; CHECK-UNROLL: [[IV14:%[a-z.0-9]+]] = add nuw nsw i32 [[IV13]], 1			; CHECK-UNROLL-T2: [[IV14:%[a-z.0-9]+]] = add nuw nsw i32 [[IV13]], 1
	; CHECK-UNROLL: [[IV15:%[a-z.0-9]+]] = add nuw nsw i32 [[IV14]], 1			; CHECK-UNROLL-T2: [[IV15:%[a-z.0-9]+]] = add nuw nsw i32 [[IV14]], 1
	; CHECK-UNROLL: [[IV16]] = add nuw nsw i32 [[IV15]], 1			; CHECK-UNROLL-T2: [[IV16]] = add nuw nsw i32 [[IV15]], 1
	; CHECK-UNROLL: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV16]], 1024			; CHECK-UNROLL-T2: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV16]], 1024
	; CHECK-UNROLL: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body			; CHECK-UNROLL-T2: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body

				; CHECK-UNROLL-T2-BP: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV16:%[a-z.0-9]+]], %for.body ]
				; CHECK-UNROLL-T2-BP: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
				; CHECK-UNROLL-T2-BP: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
				; CHECK-UNROLL-T2-BP: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
				; CHECK-UNROLL-T2-BP: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1
				; CHECK-UNROLL-T2-BP: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
				; CHECK-UNROLL-T2-BP: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1
				; CHECK-UNROLL-T2-BP: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
				; CHECK-UNROLL-T2-BP: [[IV8:%[a-z.0-9]+]] = add nuw nsw i32 [[IV7]], 1
				; CHECK-UNROLL-T2-BP: [[IV9:%[a-z.0-9]+]] = add nuw nsw i32 [[IV8]], 1
				; CHECK-UNROLL-T2-BP: [[IV10:%[a-z.0-9]+]] = add nuw nsw i32 [[IV9]], 1
				; CHECK-UNROLL-T2-BP: [[IV11:%[a-z.0-9]+]] = add nuw nsw i32 [[IV10]], 1
				; CHECK-UNROLL-T2-BP: [[IV12:%[a-z.0-9]+]] = add nuw nsw i32 [[IV11]], 1
				; CHECK-UNROLL-T2-BP: [[IV13:%[a-z.0-9]+]] = add nuw nsw i32 [[IV12]], 1
				; CHECK-UNROLL-T2-BP: [[IV14:%[a-z.0-9]+]] = add nuw nsw i32 [[IV13]], 1
				; CHECK-UNROLL-T2-BP: [[IV15:%[a-z.0-9]+]] = add nuw nsw i32 [[IV14]], 1
				; CHECK-UNROLL-T2-BP: [[IV16]] = add nuw nsw i32 [[IV15]], 1
				; CHECK-UNROLL-T2-BP: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV16]], 1024
				; CHECK-UNROLL-T2-BP: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body

	%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]			%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
	%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08			%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08
	%0 = load i32, i32* %arrayidx, align 4			%0 = load i32, i32* %arrayidx, align 4
	%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08			%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08
	%1 = load i32, i32* %arrayidx1, align 4			%1 = load i32, i32* %arrayidx1, align 4
	%mul = mul nsw i32 %1, %0			%mul = mul nsw i32 %1, %0
	%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08			%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08
	Show All 9 Lines
	; CHECK-LABEL: runtime			; CHECK-LABEL: runtime
	define arm_aapcs_vfpcc void @runtime(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {			define arm_aapcs_vfpcc void @runtime(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
	entry:			entry:
	%cmp8 = icmp eq i32 %N, 0			%cmp8 = icmp eq i32 %N, 0
	br i1 %cmp8, label %for.cond.cleanup, label %for.body			br i1 %cmp8, label %for.cond.cleanup, label %for.body

	; CHECK-LABEL: for.body			; CHECK-LABEL: for.body
	for.body:			for.body:
	; CHECK-UNROLL-V7: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV8:%[a-z.0-9]+]], %for.body ]			; CHECK-UNROLL-A: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV2:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-V7: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-A: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-V7: [[IV2]] = add nuw i32 [[IV1]], 1			; CHECK-UNROLL-A: [[IV2]] = add nuw i32 [[IV1]], 1
				; CHECK-UNROLL-A: br
	; CHECK-UNROLL-SMALL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV8:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-SMALL: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV2:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-SMALL: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-T1: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-SMALL: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1			; CHECK-UNROLL-T1: [[IV2]] = add nuw i32 [[IV1]], 1
	; CHECK-UNROLL-SMALL: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1			; CHECK-UNROLL-T1: br
	; CHECK-UNROLL-SMALL: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
	; CHECK-UNROLL-SMALL: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1			; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV2:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-SMALL: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1			; CHECK-UNROLL-T2: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-SMALL: [[IV8]] = add nuw i32 [[IV7]], 1			; CHECK-UNROLL-T2: [[IV2]] = add nuw i32 [[IV1]], 1
				; CHECK-UNROLL-T2: br
	; CHECK-UNROLL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV8:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-T2-BP: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV4:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-T2-BP: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1			; CHECK-UNROLL-T2-BP: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
	; CHECK-UNROLL: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1			; CHECK-UNROLL-T2-BP: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
	; CHECK-UNROLL: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1			; CHECK-UNROLL-T2-BP: [[IV4]] = add nuw i32 [[IV3]], 1
	; CHECK-UNROLL: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1			; CHECK-UNROLL-T2-BP: br
	; CHECK-UNROLL: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
	; CHECK-UNROLL: [[IV8]] = add nuw i32 [[IV7]], 1

	%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]			%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
	%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09			%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09
	%0 = load i32, i32* %arrayidx, align 4			%0 = load i32, i32* %arrayidx, align 4
	%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.09			%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.09
	%1 = load i32, i32* %arrayidx1, align 4			%1 = load i32, i32* %arrayidx1, align 4
	%mul = mul nsw i32 %1, %0			%mul = mul nsw i32 %1, %0
	%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.09			%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.09
	Show All 22 Lines

	for.cond.cleanup3:			for.cond.cleanup3:
	%inc11 = add nuw i32 %h.026, 1			%inc11 = add nuw i32 %h.026, 1
	%exitcond27 = icmp eq i32 %inc11, %N			%exitcond27 = icmp eq i32 %inc11, %N
	br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph			br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph

	; CHECK-LABEL: for.body4			; CHECK-LABEL: for.body4
	for.body4:			for.body4:
	; CHECK-UNROLL-V7: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV1:%[a-z.0-9]+]], %for.body4 ]			; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV2:%[a-z.0-9]+]], %for.body4 ]
	; CHECK-UNROLL-V7: [[IV1:%[a-z.0-9]+]] = add nuw i32 [[IV0]], 1			; CHECK-UNROLL-T1: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
				; CHECK-UNROLL-T1: [[IV2]] = add nuw i32 [[IV1]], 1
	; CHECK-UNROLL-SMALL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV4:%[a-z.0-9]+]], %for.body4 ]			; CHECK-UNROLL-T1: br
	; CHECK-UNROLL-SMALL: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-SMALL: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV2:%[a-z.0-9]+]], %for.body4 ]
	; CHECK-UNROLL-SMALL: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1			; CHECK-UNROLL-T2: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-SMALL: [[IV4]] = add nuw i32 [[IV3]], 1			; CHECK-UNROLL-T2: [[IV2]] = add nuw i32 [[IV1]], 1
				; CHECK-UNROLL-T2: br
	; CHECK-UNROLL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV8:%[a-z.0-9]+]], %for.body4 ]
	; CHECK-UNROLL: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-T2-BP: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV4:%[a-z.0-9]+]], %for.body4 ]
	; CHECK-UNROLL: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1			; CHECK-UNROLL-T2-BP: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1			; CHECK-UNROLL-T2-BP: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
	; CHECK-UNROLL: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1			; CHECK-UNROLL-T2-BP: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
	; CHECK-UNROLL: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1			; CHECK-UNROLL-T2-BP: [[IV4]] = add nuw i32 [[IV3]], 1
	; CHECK-UNROLL: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1			; CHECK-UNROLL-T2-BP: br
	; CHECK-UNROLL: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
	; CHECK-UNROLL: [[IV8:%[a-z.0-9]+]] = add nuw i32 [[IV7]], 1

	%w.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]			%w.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
	%add = add i32 %w.024, %mul			%add = add i32 %w.024, %mul
	%arrayidx = getelementptr inbounds i16, i16* %A, i32 %add			%arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
	%0 = load i16, i16* %arrayidx, align 2			%0 = load i16, i16* %arrayidx, align 2
	%conv = sext i16 %0 to i32			%conv = sext i16 %0 to i32
	%arrayidx5 = getelementptr inbounds i16, i16* %B, i32 %w.024			%arrayidx5 = getelementptr inbounds i16, i16* %B, i32 %w.024
	%1 = load i16, i16* %arrayidx5, align 2			%1 = load i16, i16* %arrayidx5, align 2
	Show All 13 Lines
	entry:			entry:
	br label %for.body			br label %for.body

	for.cond.cleanup:			for.cond.cleanup:
	ret void			ret void

	; CHECK-LABEL: for.body			; CHECK-LABEL: for.body
	for.body:			for.body:
	; CHECK-UNROLL-V7: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]			; CHECK-UNROLL-A: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-V7: [[IV1]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-A: [[IV1]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL-V7: icmp eq i32 [[IV1]], 1024			; CHECK-UNROLL-A: icmp eq i32 [[IV1]], 1024
				; CHECK-UNROLL-A: br
	; CHECK-UNROLL-SMALL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-SMALL: [[IV1]] = add nuw nsw i32 [[IV0]], 1			; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
	; CHECK-UNROLL-SMALL: icmp eq i32 [[IV1]], 1024			; CHECK-UNROLL-T1: [[IV1]] = add nuw nsw i32 [[IV0]], 1
				; CHECK-UNROLL-T1: icmp eq i32 [[IV1]], 1024
	; CHECK-UNROLL: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]			; CHECK-UNROLL-T1: br
	; CHECK-UNROLL: [[IV1]] = add nuw nsw i32 [[IV0]], 1
	; CHECK-UNROLL: icmp eq i32 [[IV1]], 1024			; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
				; CHECK-UNROLL-T2: [[IV1]] = add nuw nsw i32 [[IV0]], 1
				; CHECK-UNROLL-T2: icmp eq i32 [[IV1]], 1024
				; CHECK-UNROLL-T2: br

				; CHECK-UNROLL-T2-BP: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
				; CHECK-UNROLL-T2-BP: [[IV1]] = add nuw nsw i32 [[IV0]], 1
				; CHECK-UNROLL-T2-BP: icmp eq i32 [[IV1]], 1024
				; CHECK-UNROLL-T2-BP: br

	%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]			%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
	%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08			%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08
	%0 = load i32, i32* %arrayidx, align 4			%0 = load i32, i32* %arrayidx, align 4
	%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08			%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08
	%1 = load i32, i32* %arrayidx1, align 4			%1 = load i32, i32* %arrayidx1, align 4
	%call = tail call arm_aapcs_vfpcc i32 @some_func(i32 %0, i32 %1) #3			%call = tail call arm_aapcs_vfpcc i32 @some_func(i32 %0, i32 %1) #3
	%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08			%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08
	store i32 %call, i32* %arrayidx2, align 4			store i32 %call, i32* %arrayidx2, align 4
	%inc = add nuw nsw i32 %i.08, 1			%inc = add nuw nsw i32 %i.08, 1
	%exitcond = icmp eq i32 %inc, 1024			%exitcond = icmp eq i32 %inc, 1024
	br i1 %exitcond, label %for.cond.cleanup, label %for.body			br i1 %exitcond, label %for.cond.cleanup, label %for.body
	}			}

				; CHECK-LABEL: iterate_inc
				; CHECK-UNROLL-A: %n.addr.04 = phi %struct.Node* [ %1, %while.body ], [ %n, %while.body.preheader ]
				; CHECK-UNROLL-A: %tobool = icmp eq %struct.Node* %1, null
				; CHECK-UNROLL-A: br i1 %tobool
				; CHECK-UNROLL-A-NOT: load

				; CHECK-UNROLL-T1: [[CMP0:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR0:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T1: br i1 [[CMP0]], label [[END:%[a-z.0-9]+]]
				; CHECK-UNROLL-T1: [[CMP1:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR1:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T1: br i1 [[CMP1]], label [[END]]
				; CHECK-UNROLL-T1: [[CMP2:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR2:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T1: br i1 [[CMP2]], label [[END]]
				; CHECK-UNROLL-T1: [[CMP3:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR3:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T1: br i1 [[CMP3]], label [[END]]
				; CHECK-UNROLL-T1: [[CMP4:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR4:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T1: br i1 [[CMP4]], label [[END]]
				; CHECK-UNROLL-T1-NOT: load

				; CHECK-UNROLL-T2: [[CMP0:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR0:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2: br i1 [[CMP0]], label [[END:%[a-z.0-9]+]]
				; CHECK-UNROLL-T2: [[CMP1:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR1:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2: br i1 [[CMP1]], label [[END]]
				; CHECK-UNROLL-T2: [[CMP2:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR2:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2: br i1 [[CMP2]], label [[END]]
				; CHECK-UNROLL-T2: [[CMP3:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR3:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2: br i1 [[CMP3]], label [[END]]
				; CHECK-UNROLL-T2: [[CMP4:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR4:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2: br i1 [[CMP4]], label [[END]]
				; CHECK-UNROLL-T2-NOT: load

				; CHECK-UNROLL-T2-BP: [[CMP0:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR0:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2-BP: br i1 [[CMP0]], label [[END:%[a-z.0-9]+]]
				; CHECK-UNROLL-T2-BP: [[CMP1:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR1:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2-BP: br i1 [[CMP1]], label [[END]]
				; CHECK-UNROLL-T2-BP: [[CMP2:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR2:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2-BP: br i1 [[CMP2]], label [[END]]
				; CHECK-UNROLL-T2-BP: [[CMP3:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR3:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2-BP: br i1 [[CMP3]], label [[END]]
				; CHECK-UNROLL-T2-BP: [[CMP4:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR4:%[a-z.0-9]+]], null
				; CHECK-UNROLL-T2-BP: br i1 [[CMP4]], label [[END]]
				; CHECK-UNROLL-T2-BP-NOT: load
				%struct.Node = type { %struct.Node*, i32 }

				; Walks a singly linked list of %struct.Node starting at %n and adds 1 to the
				; i32 field (index 1) of every node. The loop exits when the loaded next
				; pointer (field index 0) is null, so the number of iterations depends on
				; the list contents rather than on an induction variable.
				define arm_aapcscc void @iterate_inc(%struct.Node* %n) local_unnamed_addr #0 {
				entry:
				  ; An empty list (null head) skips the loop entirely.
				  %tobool3 = icmp eq %struct.Node* %n, null
				  br i1 %tobool3, label %while.end, label %while.body.preheader

				while.body.preheader:
				  br label %while.body

				while.body:
				  %n.addr.04 = phi %struct.Node* [ %1, %while.body ], [ %n, %while.body.preheader ]
				  ; Increment the node's i32 payload in place.
				  %val = getelementptr inbounds %struct.Node, %struct.Node* %n.addr.04, i32 0, i32 1
				  %0 = load i32, i32* %val, align 4
				  %add = add nsw i32 %0, 1
				  store i32 %add, i32* %val, align 4
				  ; %next addresses field 0, which has type %struct.Node*, so %next is a
				  ; %struct.Node** and the load must produce a %struct.Node* for the
				  ; null comparison and the phi above.
				  %next = getelementptr inbounds %struct.Node, %struct.Node* %n.addr.04, i32 0, i32 0
				  %1 = load %struct.Node*, %struct.Node** %next, align 4
				  %tobool = icmp eq %struct.Node* %1, null
				  br i1 %tobool, label %while.end, label %while.body

				while.end:
				  ret void
				}

	declare arm_aapcs_vfpcc i32 @some_func(i32, i32) local_unnamed_addr #2			declare arm_aapcs_vfpcc i32 @some_func(i32, i32) local_unnamed_addr #2

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Improve loop unrolling for Cortex-M
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 109067

lib/Target/ARM/ARM.td

lib/Target/ARM/ARMTargetTransformInfo.cpp

test/Transforms/LoopUnroll/ARM/loop-unrolling.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Improve loop unrolling for Cortex-M
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 109067

lib/Target/ARM/ARM.td

lib/Target/ARM/ARMTargetTransformInfo.cpp

test/Transforms/LoopUnroll/ARM/loop-unrolling.ll

[ARM] Improve loop unrolling for Cortex-M
ClosedPublic