This is an archive of the discontinued LLVM Phabricator instance.

[LV] Relax Small Size Reduction Type Requirement
ClosedPublic

Authored by mssimpso on Sep 10 2015, 12:08 PM.

Download Raw Diff

Details

Reviewers

jmolloy
hfinkel

Commits

rG29dc0f70751b: [LV] Relax Small Size Reduction Type Requirement
rL247337: [LV] Relax Small Size Reduction Type Requirement

Summary

This patch enables small size reductions in which the source types are smaller
than the reduction type (e.g., computing an i16 sum from the values in an i8
array). The previous behavior was to only allow small size reductions if the
source types and reduction type were the same. The change accounts for the fact
that the existing sign- and zero-extend instructions in these cases should
still be included in the cost model.

Diff Detail

Repository: rL LLVM

Event Timeline

mssimpso updated this revision to Diff 34473. Sep 10 2015, 12:08 PM

mssimpso retitled this revision to [LV] Relax Small Size Reduction Type Requirement.

mssimpso updated this object.

mssimpso added reviewers: jmolloy, hfinkel.

mssimpso added subscribers: llvm-commits, mcrosier.

@jmolloy: When vectorization is forced, this change catches a loop in 256.bzip2 that the previous revision (D12202) missed.

Hi Matt,

This looks fine to me.

Cheers,

James

This revision is now accepted and ready to land. Sep 10 2015, 12:55 PM

Thanks very much, James, for the quick review!

Closed by commit rL247337: [LV] Relax Small Size Reduction Type Requirement (authored by mssimpso). · Explain Why · Sep 10 2015, 2:14 PM

This revision was automatically updated to reflect the committed changes.

What kind of correctness/performance testing has been conducted?

test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
69 ↗	(On Diff #34473)	Instead of the _2 suffix, how about _short_short?
95 ↗	(On Diff #34473)	_short_short
130 ↗	(On Diff #34473)	_2 -> _short_char

Correctness validations were run on the nightly test suite, spec2000, and spec2006. The change does not affect the performance of these programs. The 256.bzip2 loop I mentioned to @jmolloy is affected only by disabling the cost model. I had disabled the cost model in order to discover current limitations.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

Utils/

LoopUtils.cpp

20 lines

test/

Transforms/

LoopVectorize/

AArch64/

reduction-small-size.ll

69 lines

Diff 34489

llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp

	Show First 20 Lines • Show All 92 Lines • ▼ Show 20 Lines

	bool RecurrenceDescriptor::getSourceExtensionKind(			bool RecurrenceDescriptor::getSourceExtensionKind(
	Instruction Start, Instruction Exit, Type *RT, bool &IsSigned,			Instruction Start, Instruction Exit, Type *RT, bool &IsSigned,
	SmallPtrSetImpl<Instruction *> &Visited,			SmallPtrSetImpl<Instruction *> &Visited,
	SmallPtrSetImpl<Instruction *> &CI) {			SmallPtrSetImpl<Instruction *> &CI) {

	SmallVector<Instruction *, 8> Worklist;			SmallVector<Instruction *, 8> Worklist;
	bool FoundOneOperand = false;			bool FoundOneOperand = false;
				unsigned DstSize = RT->getPrimitiveSizeInBits();
	Worklist.push_back(Exit);			Worklist.push_back(Exit);

	// Traverse the instructions in the reduction expression, beginning with the			// Traverse the instructions in the reduction expression, beginning with the
	// exit value.			// exit value.
	while (!Worklist.empty()) {			while (!Worklist.empty()) {
	Instruction *I = Worklist.pop_back_val();			Instruction *I = Worklist.pop_back_val();
	for (Use &U : I->operands()) {			for (Use &U : I->operands()) {

	// Terminate the traversal if the operand is not an instruction, or we			// Terminate the traversal if the operand is not an instruction, or we
	// reach the starting value.			// reach the starting value.
	Instruction *J = dyn_cast<Instruction>(U.get());			Instruction *J = dyn_cast<Instruction>(U.get());
	if (!J \|\| J == Start)			if (!J \|\| J == Start)
	continue;			continue;

	// Otherwise, investigate the operation if it is also in the expression.			// Otherwise, investigate the operation if it is also in the expression.
	if (Visited.count(J)) {			if (Visited.count(J)) {
	Worklist.push_back(J);			Worklist.push_back(J);
	continue;			continue;
	}			}

	// If the operand is not in Visited, it is not a reduction operation, but			// If the operand is not in Visited, it is not a reduction operation, but
	// it does feed into one. Make sure it is either a single-use sign- or			// it does feed into one. Make sure it is either a single-use sign- or
	// zero-extend of the recurrence type.			// zero-extend instruction.
	CastInst *Cast = dyn_cast<CastInst>(J);			CastInst *Cast = dyn_cast<CastInst>(J);
	bool IsSExtInst = isa<SExtInst>(J);			bool IsSExtInst = isa<SExtInst>(J);
	if (!Cast \|\| !Cast->hasOneUse() \|\| Cast->getSrcTy() != RT \|\|			if (!Cast \|\| !Cast->hasOneUse() \|\| !(isa<ZExtInst>(J) \|\| IsSExtInst))
	!(isa<ZExtInst>(J) \|\| IsSExtInst))			return false;

				// Ensure the source type of the extend is no larger than the reduction
				// type. It is not necessary for the types to be identical.
				unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
				if (SrcSize > DstSize)
	return false;			return false;

	// Furthermore, ensure that all such extends are of the same kind.			// Furthermore, ensure that all such extends are of the same kind.
	if (FoundOneOperand) {			if (FoundOneOperand) {
	if (IsSigned != IsSExtInst)			if (IsSigned != IsSExtInst)
	return false;			return false;
	} else {			} else {
	FoundOneOperand = true;			FoundOneOperand = true;
	IsSigned = IsSExtInst;			IsSigned = IsSExtInst;
	}			}

	// Lastly, add the sign- or zero-extend to CI so that we can avoid			// Lastly, if the source type of the extend matches the reduction type,
	// accounting for it in the cost model.			// add the extend to CI so that we can avoid accounting for it in the
				// cost model.
				if (SrcSize == DstSize)
	CI.insert(Cast);			CI.insert(Cast);
	}			}
	}			}
	return true;			return true;
	}			}

	bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,			bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
	Loop *TheLoop, bool HasFunNoNaNAttr,			Loop *TheLoop, bool HasFunNoNaNAttr,
	RecurrenceDescriptor &RedDes) {			RecurrenceDescriptor &RedDes) {
	▲ Show 20 Lines • Show All 565 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll

Show First 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	for.body:
%add = add nuw nsw i32 %conv, %conv4		%add = add nuw nsw i32 %conv, %conv4
%add5 = add nuw nsw i32 %add, %conv3		%add5 = add nuw nsw i32 %add, %conv3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32		%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n		%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body		br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}		}

; CHECK-LABEL: @reduction_i16		; CHECK-LABEL: @reduction_i16_1
;		;
; short reduction_i16(short a, short b, int n) {		; short reduction_i16_1(short a, short b, int n) {
; short sum = 0;		; short sum = 0;
; for (int i = 0; i < n; ++i)		; for (int i = 0; i < n; ++i)
; sum += (a[i] + b[i]);		; sum += (a[i] + b[i]);
; return sum;		; return sum;
; }		; }
;		;
; CHECK: vector.body:		; CHECK: vector.body:
; CHECK: phi <8 x i16>		; CHECK: phi <8 x i16>
; CHECK: load <8 x i16>		; CHECK: load <8 x i16>
; CHECK: load <8 x i16>		; CHECK: load <8 x i16>
; CHECK: add <8 x i16>		; CHECK: add <8 x i16>
; CHECK: add <8 x i16>		; CHECK: add <8 x i16>
;		;
; CHECK: middle.block:		; CHECK: middle.block:
; CHECK: shufflevector <8 x i16>		; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>		; CHECK: add <8 x i16>
; CHECK: shufflevector <8 x i16>		; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>		; CHECK: add <8 x i16>
; CHECK: shufflevector <8 x i16>		; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>		; CHECK: add <8 x i16>
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>		; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
; CHECK: zext i16 [[Rdx]] to i32		; CHECK: zext i16 [[Rdx]] to i32
;		;
define i16 @reduction_i16(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {		define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
entry:		entry:
%cmp.16 = icmp sgt i32 %n, 0		%cmp.16 = icmp sgt i32 %n, 0
br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup		br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:		for.body.preheader:
br label %for.body		br label %for.body

for.cond.for.cond.cleanup_crit_edge:		for.cond.for.cond.cleanup_crit_edge:
Show All 17 Lines	for.body:
%conv4.13 = and i32 %sum.017, 65535		%conv4.13 = and i32 %sum.017, 65535
%add = add nuw nsw i32 %conv.14, %conv4.13		%add = add nuw nsw i32 %conv.14, %conv4.13
%add5 = add nuw nsw i32 %add, %conv3.15		%add5 = add nuw nsw i32 %add, %conv3.15
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32		%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n		%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body		br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}		}

		; CHECK-LABEL: @reduction_i16_2
		;
		; short reduction_i16_2(char a, char b, int n) {
		; short sum = 0;
		; for (int i = 0; i < n; ++i)
		; sum += (a[i] + b[i]);
		; return sum;
		; }
		;
		; CHECK: vector.body:
		; CHECK: phi <8 x i16>
		; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
		; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
		; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
		; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
		; CHECK: add <8 x i16>
		; CHECK: add <8 x i16>
		;
		; CHECK: middle.block:
		; CHECK: shufflevector <8 x i16>
		; CHECK: add <8 x i16>
		; CHECK: shufflevector <8 x i16>
		; CHECK: add <8 x i16>
		; CHECK: shufflevector <8 x i16>
		; CHECK: add <8 x i16>
		; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
		; CHECK: zext i16 [[Rdx]] to i32
		;
		define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
		entry:
		%cmp.14 = icmp sgt i32 %n, 0
		br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup

		for.body.preheader:
		br label %for.body

		for.cond.for.cond.cleanup_crit_edge:
		%add5.lcssa = phi i32 [ %add5, %for.body ]
		%conv6 = trunc i32 %add5.lcssa to i16
		br label %for.cond.cleanup

		for.cond.cleanup:
		%sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
		ret i16 %sum.0.lcssa

		for.body:
		%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
		%sum.015 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
		%arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
		%0 = load i8, i8* %arrayidx, align 1
		%conv = zext i8 %0 to i32
		%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
		%1 = load i8, i8* %arrayidx2, align 1
		%conv3 = zext i8 %1 to i32
		%conv4.13 = and i32 %sum.015, 65535
		%add = add nuw nsw i32 %conv, %conv4.13
		%add5 = add nuw nsw i32 %add, %conv3
		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
		%lftr.wideiv = trunc i64 %indvars.iv.next to i32
		%exitcond = icmp eq i32 %lftr.wideiv, %n
		br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
		}