This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
include/llvm/Transforms/Utils/
-
llvm/
-
Transforms/
-
Utils/
-
LoopUtils.h
-
lib/
-
CodeGen/
-
ExpandReductions.cpp
-
Transforms/Utils/
-
Utils/
-
LoopUtils.cpp
-
test/CodeGen/Generic/
-
CodeGen/
-
Generic/
-
expand-experimental-reductions.ll

Differential D45366

Support generic expansion of ordered vector reduction (PR36732)
ClosedPublic

Authored by RKSimon on Apr 6 2018, 6:55 AM.

Download Raw Diff

Details

Reviewers

aemerson
• gnzlbg
chandlerc
hfinkel
rengolin
mkuper
ABataev
spatel

Commits

rG23c2182c2bfc: Support generic expansion of ordered vector reduction (PR36732)
rL329585: Support generic expansion of ordered vector reduction (PR36732)

Summary

Without the fast math flags, the llvm.experimental.vector.reduce.fadd/fmul intrinsics must be expanded in order.

This patch scalarizes the reduction, applying the accumulator at the start of the sequence: ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[NumElts-1]

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision.Apr 6 2018, 6:55 AM

ABataev added inline comments.Apr 6 2018, 7:06 AM

include/llvm/Transforms/Utils/LoopUtils.h
517 ↗	(On Diff #141335)	`ArrayRef<Value *> RedOps = ArrayRef<Value *>()`->`ArrayRef<Value *> RedOps = None`
lib/CodeGen/ExpandReductions.cpp
122–124 ↗	(On Diff #141335)	`auto`->`Value *`
lib/Transforms/Utils/LoopUtils.cpp
1537 ↗	(On Diff #141335)	`auto`->`auto &&` Use explicit capturing
1560 ↗	(On Diff #141335)	Use preincrement expression

For now, that the shuffle reduction is in loop utils, this patch is fine. But this really ought to be in target transform info.

lib/Transforms/Utils/LoopUtils.cpp
1531 ↗	(On Diff #141335)	SVE will be able to do this with a single instruction, we should be able to override this based on target info.

In D45366#1059769, @rengolin wrote:

For now, that the shuffle reduction is in loop utils, this patch is fine. But this really ought to be in target transform info.

I agree, I'll do that as an NFC once we've finalised the behaviour of reductions a bit more.

lib/Transforms/Utils/LoopUtils.cpp
1531 ↗	(On Diff #141335)	This should be covered by TTI::useReductionIntrinsic
1537 ↗	(On Diff #141335)	Even better, I'll remove the lambda entirely and inline it.

Cleaned up based on @ABataev's comments.

@aemerson explained on D45336 that the strict/ordered reductions should always use the accumulator argument, so I've removed the is null or undef logic.

RKSimon mentioned this in rL329425: Add additional tests from D45366.Apr 6 2018, 10:20 AM

RKSimon mentioned this in rL329431: Cleanup Reduction helpers by using ArrayRef(NoneType) constructor. NFCI..Apr 6 2018, 10:28 AM

rebased

aemerson accepted this revision.Apr 9 2018, 2:41 AM

This revision is now accepted and ready to land.Apr 9 2018, 2:41 AM

Closed by commit rL329585: Support generic expansion of ordered vector reduction (PR36732) (authored by RKSimon). · Explain WhyApr 9 2018, 8:47 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

Transforms/

Utils/

LoopUtils.h

7 lines

lib/

CodeGen/

ExpandReductions.cpp

15 lines

Transforms/

Utils/

LoopUtils.cpp

32 lines

test/

CodeGen/

Generic/

expand-experimental-reductions.ll

44 lines

Diff 141665

llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h

	Show First 20 Lines • Show All 503 Lines • ▼ Show 20 Lines
	/// instructions from loop body to preheader/exit. Check if the instruction			/// instructions from loop body to preheader/exit. Check if the instruction
	/// can execute speculatively.			/// can execute speculatively.
	/// If \p ORE is set use it to emit optimization remarks.			/// If \p ORE is set use it to emit optimization remarks.
	bool canSinkOrHoistInst(Instruction &I, AAResults AA, DominatorTree DT,			bool canSinkOrHoistInst(Instruction &I, AAResults AA, DominatorTree DT,
	Loop CurLoop, AliasSetTracker CurAST,			Loop CurLoop, AliasSetTracker CurAST,
	LoopSafetyInfo *SafetyInfo,			LoopSafetyInfo *SafetyInfo,
	OptimizationRemarkEmitter *ORE = nullptr);			OptimizationRemarkEmitter *ORE = nullptr);

				/// Generates an ordered vector reduction using extracts to reduce the value.
				Value *
				getOrderedReduction(IRBuilder<> &Builder, Value Acc, Value Src, unsigned Op,
				RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
				RecurrenceDescriptor::MRK_Invalid,
				ArrayRef<Value *> RedOps = None);

	/// Generates a vector reduction using shufflevectors to reduce the value.			/// Generates a vector reduction using shufflevectors to reduce the value.
	Value getShuffleReduction(IRBuilder<> &Builder, Value Src, unsigned Op,			Value getShuffleReduction(IRBuilder<> &Builder, Value Src, unsigned Op,
	RecurrenceDescriptor::MinMaxRecurrenceKind			RecurrenceDescriptor::MinMaxRecurrenceKind
	MinMaxKind = RecurrenceDescriptor::MRK_Invalid,			MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
	ArrayRef<Value *> RedOps = None);			ArrayRef<Value *> RedOps = None);

	/// Create a target reduction of the given vector. The reduction operation			/// Create a target reduction of the given vector. The reduction operation
	/// is described by the \p Opcode parameter. min/max reductions require			/// is described by the \p Opcode parameter. min/max reductions require
	Show All 27 Lines

llvm/trunk/lib/CodeGen/ExpandReductions.cpp

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	case Intrinsic::experimental_vector_reduce_fmin:
return RecurrenceDescriptor::MRK_FloatMin;		return RecurrenceDescriptor::MRK_FloatMin;
default:		default:
return RecurrenceDescriptor::MRK_Invalid;		return RecurrenceDescriptor::MRK_Invalid;
}		}
}		}

bool expandReductions(Function &F, const TargetTransformInfo *TTI) {		bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
bool Changed = false;		bool Changed = false;
SmallVector<IntrinsicInst*, 4> Worklist;		SmallVector<IntrinsicInst *, 4> Worklist;
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)		for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
if (auto II = dyn_cast<IntrinsicInst>(&*I))		if (auto II = dyn_cast<IntrinsicInst>(&*I))
Worklist.push_back(II);		Worklist.push_back(II);

for (auto *II : Worklist) {		for (auto *II : Worklist) {
IRBuilder<> Builder(II);		IRBuilder<> Builder(II);
		bool IsOrdered = false;
		Value *Acc = nullptr;
Value *Vec = nullptr;		Value *Vec = nullptr;
auto ID = II->getIntrinsicID();		auto ID = II->getIntrinsicID();
auto MRK = RecurrenceDescriptor::MRK_Invalid;		auto MRK = RecurrenceDescriptor::MRK_Invalid;
switch (ID) {		switch (ID) {
case Intrinsic::experimental_vector_reduce_fadd:		case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:		case Intrinsic::experimental_vector_reduce_fmul:
// FMFs must be attached to the call, otherwise it's an ordered reduction		// FMFs must be attached to the call, otherwise it's an ordered reduction
// and it can't be handled by generating this shuffle sequence.		// and it can't be handled by generating a shuffle sequence.
// TODO: Implement scalarization of ordered reductions here for targets
// without native support.
if (!II->getFastMathFlags().isFast())		if (!II->getFastMathFlags().isFast())
continue;		IsOrdered = true;
		Acc = II->getArgOperand(0);
Vec = II->getArgOperand(1);		Vec = II->getArgOperand(1);
break;		break;
case Intrinsic::experimental_vector_reduce_add:		case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:		case Intrinsic::experimental_vector_reduce_mul:
case Intrinsic::experimental_vector_reduce_and:		case Intrinsic::experimental_vector_reduce_and:
case Intrinsic::experimental_vector_reduce_or:		case Intrinsic::experimental_vector_reduce_or:
case Intrinsic::experimental_vector_reduce_xor:		case Intrinsic::experimental_vector_reduce_xor:
case Intrinsic::experimental_vector_reduce_smax:		case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_smin:		case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_umax:		case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_umin:		case Intrinsic::experimental_vector_reduce_umin:
case Intrinsic::experimental_vector_reduce_fmax:		case Intrinsic::experimental_vector_reduce_fmax:
case Intrinsic::experimental_vector_reduce_fmin:		case Intrinsic::experimental_vector_reduce_fmin:
Vec = II->getArgOperand(0);		Vec = II->getArgOperand(0);
MRK = getMRK(ID);		MRK = getMRK(ID);
break;		break;
default:		default:
continue;		continue;
}		}
if (!TTI->shouldExpandReduction(II))		if (!TTI->shouldExpandReduction(II))
continue;		continue;
auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);		Value *Rdx =
		IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
		: getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
II->replaceAllUsesWith(Rdx);		II->replaceAllUsesWith(Rdx);
II->eraseFromParent();		II->eraseFromParent();
Changed = true;		Changed = true;
}		}
return Changed;		return Changed;
}		}

class ExpandReductions : public FunctionPass {		class ExpandReductions : public FunctionPass {
Show All 38 Lines

llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp

Show First 20 Lines • Show All 1,520 Lines • ▼ Show 20 Lines	static Value addFastMathFlag(Value V) {
if (isa<FPMathOperator>(V)) {		if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;		FastMathFlags Flags;
Flags.setFast();		Flags.setFast();
cast<Instruction>(V)->setFastMathFlags(Flags);		cast<Instruction>(V)->setFastMathFlags(Flags);
}		}
return V;		return V;
}		}

		// Helper to generate an ordered reduction.
		Value *
		llvm::getOrderedReduction(IRBuilder<> &Builder, Value Acc, Value Src,
		unsigned Op,
		RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
		ArrayRef<Value *> RedOps) {
		unsigned VF = Src->getType()->getVectorNumElements();

		// Extract and apply reduction ops in ascending order:
		// e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1]
		Value *Result = Acc;
		for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
		Value *Ext =
		Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));

		if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
		Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
		"bin.rdx");
		} else {
		assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
		"Invalid min/max");
		Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
		Ext);
		}

		if (!RedOps.empty())
		propagateIRFlags(Result, RedOps);
		}

		return Result;
		}

// Helper to generate a log2 shuffle reduction.		// Helper to generate a log2 shuffle reduction.
Value *		Value *
llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,		llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,		RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
ArrayRef<Value *> RedOps) {		ArrayRef<Value *> RedOps) {
unsigned VF = Src->getType()->getVectorNumElements();		unsigned VF = Src->getType()->getVectorNumElements();
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles		// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
// and vector ops, reducing the set of values being computed by half each		// and vector ops, reducing the set of values being computed by half each
▲ Show 20 Lines • Show All 170 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll

	Show First 20 Lines • Show All 111 Lines • ▼ Show 20 Lines
	entry:			entry:
	%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)			%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
	ret float %r			ret float %r
	}			}

	define float @fadd_f32_strict(<4 x float> %vec) {			define float @fadd_f32_strict(<4 x float> %vec) {
	; CHECK-LABEL: @fadd_f32_strict(			; CHECK-LABEL: @fadd_f32_strict(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[R:%.]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.]])			; CHECK-NEXT: [[TMP0:%.]] = extractelement <4 x float> [[VEC:%.]], i32 0
	; CHECK-NEXT: ret float [[R]]			; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
				; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
				; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
				; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
				; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
				; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
				; CHECK-NEXT: ret float [[BIN_RDX3]]
	;			;
	entry:			entry:
	%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)			%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
	ret float %r			ret float %r
	}			}

	define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {			define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
	; CHECK-LABEL: @fadd_f32_strict_accum(			; CHECK-LABEL: @fadd_f32_strict_accum(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[R:%.]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.]], <4 x float> [[VEC:%.*]])			; CHECK-NEXT: [[TMP0:%.]] = extractelement <4 x float> [[VEC:%.]], i32 0
	; CHECK-NEXT: ret float [[R]]			; CHECK-NEXT: [[BIN_RDX:%.]] = fadd float [[ACCUM:%.]], [[TMP0]]
				; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
				; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
				; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
				; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
				; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
				; CHECK-NEXT: ret float [[BIN_RDX3]]
	;			;
	entry:			entry:
	%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)			%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
	ret float %r			ret float %r
	}			}

	define float @fmul_f32(<4 x float> %vec) {			define float @fmul_f32(<4 x float> %vec) {
	; CHECK-LABEL: @fmul_f32(			; CHECK-LABEL: @fmul_f32(
	Show All 23 Lines
	entry:			entry:
	%r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)			%r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
	ret float %r			ret float %r
	}			}

	define float @fmul_f32_strict(<4 x float> %vec) {			define float @fmul_f32_strict(<4 x float> %vec) {
	; CHECK-LABEL: @fmul_f32_strict(			; CHECK-LABEL: @fmul_f32_strict(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[R:%.]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.]])			; CHECK-NEXT: [[TMP0:%.]] = extractelement <4 x float> [[VEC:%.]], i32 0
	; CHECK-NEXT: ret float [[R]]			; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
				; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
				; CHECK-NEXT: [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
				; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
				; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
				; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
				; CHECK-NEXT: ret float [[BIN_RDX3]]
	;			;
	entry:			entry:
	%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)			%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
	ret float %r			ret float %r
	}			}

	define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {			define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
	; CHECK-LABEL: @fmul_f32_strict_accum(			; CHECK-LABEL: @fmul_f32_strict_accum(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[R:%.]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.]], <4 x float> [[VEC:%.*]])			; CHECK-NEXT: [[TMP0:%.]] = extractelement <4 x float> [[VEC:%.]], i32 0
	; CHECK-NEXT: ret float [[R]]			; CHECK-NEXT: [[BIN_RDX:%.]] = fmul float [[ACCUM:%.]], [[TMP0]]
				; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
				; CHECK-NEXT: [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
				; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
				; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
				; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
				; CHECK-NEXT: ret float [[BIN_RDX3]]
	;			;
	entry:			entry:
	%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)			%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
	ret float %r			ret float %r
	}			}

	define i64 @smax_i64(<2 x i64> %vec) {			define i64 @smax_i64(<2 x i64> %vec) {
	; CHECK-LABEL: @smax_i64(			; CHECK-LABEL: @smax_i64(
	▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines