This is an archive of the discontinued LLVM Phabricator instance.

Differential D80322

[AMDGPU] Tune threshold for cmp/select vector lowering
ClosedPublic

Authored by rampitec on May 20 2020, 2:07 PM.

Download Raw Diff

Details

Reviewers

arsenm

Commits

rG1dfd1b3e4b2b: [AMDGPU] Tune threshold for cmp/select vector lowering

Summary

The threshold was expressed as a total vector size, while the idea was to
limit the number of instructions. Now that the expansion also works with
doubles, the threshold needs to be updated.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

rampitec created this revision. · May 20 2020, 2:07 PM

Herald added a project: Restricted Project. · View Herald Transcript · May 20 2020, 2:07 PM

Herald added subscribers: kerbowa, hiraditya, t-tye and 7 others. · View Herald Transcript

rampitec added a parent revision: D80032: [AMDGPU] Always expand ext/insertelement with divergent idx. · May 20 2020, 2:08 PM

To expand a little on the reasoning: 256 bits of float/int yield 8 compares and 8 cndmasks, 16 instructions in total. For doubles to stay within 16 instructions the limit is double5: 5 compares and 10 cndmasks. Currently the largest double vector that gets expanded is double4.

I have done perf measurements to compare this expansion to s_set_gpr_idx on Vega10 and it breaks even around 5-6 elements with a tiny margin.

The condition became too complicated for me to understand, so I have just hoisted it into a predicate function. I also think we may move this predicate somewhere later, as we need it at least in GlobalISel, maybe in some other places too. Anyway, the same condition was already used in two places.

It would be good if we could commit these benchmarks somewhere.

This revision is now accepted and ready to land. · May 21 2020, 6:49 AM

Closed by commit rG1dfd1b3e4b2b: [AMDGPU] Tune threshold for cmp/select vector lowering (authored by rampitec). · Explain Why · May 21 2020, 9:09 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIISelLowering.cpp

54 lines

test/

CodeGen/

AMDGPU/

extract_vector_dynelt.ll

18 lines

insert_vector_dynelt.ll

11 lines

Diff 265517

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,468 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
SDValue Src0 = N->getOperand(0);		SDValue Src0 = N->getOperand(0);
SDValue Src1 = N->getOperand(1);		SDValue Src1 = N->getOperand(1);
if (Src0.isUndef() && Src1.isUndef())		if (Src0.isUndef() && Src1.isUndef())
return DCI.DAG.getUNDEF(N->getValueType(0));		return DCI.DAG.getUNDEF(N->getValueType(0));
return SDValue();		return SDValue();
}		}

		// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
		// expanded into a set of cmp/select instructions.
		static bool shouldExpandVectorDynExt(SDNode *N) {
		SDValue Idx = N->getOperand(N->getNumOperands() - 1);
		if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
		return false;

		SDValue Vec = N->getOperand(0);
		EVT VecVT = Vec.getValueType();
		EVT EltVT = VecVT.getVectorElementType();
		unsigned VecSize = VecVT.getSizeInBits();
		unsigned EltSize = EltVT.getSizeInBits();
		unsigned NumElem = VecVT.getVectorNumElements();

		// Sub-dword vectors of size 2 dword or less have better implementation.
		if (VecSize <= 64 && EltSize < 32)
		return false;

		// Always expand the rest of sub-dword instructions, otherwise it will be
		// lowered via memory.
		if (EltSize < 32)
		return true;

		// Always do this if var-idx is divergent, otherwise it will become a loop.
		if (Idx->isDivergent())
		return true;

		// Large vectors would yield too many compares and v_cndmask_b32 instructions.
		unsigned NumInsts = NumElem /* Number of compares */ +
		((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
		return NumInsts <= 16;
		}

SDValue SITargetLowering::performExtractVectorEltCombine(		SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {		SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);		SDValue Vec = N->getOperand(0);
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;

EVT VecVT = Vec.getValueType();		EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();		EVT EltVT = VecVT.getVectorElementType();

▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
}		}
}		}
}		}

unsigned VecSize = VecVT.getSizeInBits();		unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();		unsigned EltSize = EltVT.getSizeInBits();

// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)		// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
// This elminates non-constant index and subsequent movrel or scratch access.		if (shouldExpandVectorDynExt(N)) {
// Sub-dword vectors of size 2 dword or less have better implementation.
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
// instructions.
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (!UseDivergentRegisterIndexing &&
(VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
(VecSize > 64 || EltSize >= 32) &&
!isa<ConstantSDNode>(N->getOperand(1))) {
SDLoc SL(N);		SDLoc SL(N);
SDValue Idx = N->getOperand(1);		SDValue Idx = N->getOperand(1);
SDValue V;		SDValue V;
for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {		for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
SDValue IC = DAG.getVectorIdxConstant(I, SL);		SDValue IC = DAG.getVectorIdxConstant(I, SL);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);		SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
if (I == 0)		if (I == 0)
V = Elt;		V = Elt;
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines

SDValue		SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,		SITargetLowering::performInsertVectorEltCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);		SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(2);		SDValue Idx = N->getOperand(2);
EVT VecVT = Vec.getValueType();		EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();		EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();

// INSERT_VECTOR_ELT (<n x e>, var-idx)		// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)		// => BUILD_VECTOR n x select (e, const-idx)
// This elminates non-constant index and subsequent movrel or scratch access.		if (!shouldExpandVectorDynExt(N))
// Sub-dword vectors of size 2 dword or less have better implementation.
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
// instructions.
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
(VecSize > 256 && !Idx->isDivergent()) ||
(VecSize <= 64 && EltSize < 32))
return SDValue();		return SDValue();

SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);		SDLoc SL(N);
SDValue Ins = N->getOperand(1);		SDValue Ins = N->getOperand(1);
EVT IdxVT = Idx.getValueType();		EVT IdxVT = Idx.getValueType();

SmallVector<SDValue, 16> Ops;		SmallVector<SDValue, 16> Ops;
▲ Show 20 Lines • Show All 1,612 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; GCN: store_dwordx2 v[{{[0-9:]+}}]			; GCN: store_dwordx2 v[{{[0-9:]+}}]
	define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {			define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
	entry:			entry:
	%ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel			%ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
	store double %ext, double addrspace(1)* %out			store double %ext, double addrspace(1)* %out
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}double5_extelt:
				; GCN-NOT: buffer_
				; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
				; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
				; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
				; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
				; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
				; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
				; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
				; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
				; GCN: store_dwordx2 v[{{[0-9:]+}}]
				define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
				entry:
				%ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
				store double %ext, double addrspace(1)* %out
				ret void
				}

	; GCN-LABEL: {{^}}half4_extelt:			; GCN-LABEL: {{^}}half4_extelt:
	; GCN-NOT: buffer_			; GCN-NOT: buffer_
	; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00			; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
	; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200			; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
	; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4			; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4
	; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]			; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
	; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]			; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
	; GCN: store_short v[{{[0-9:]+}}], v[[VRL]]			; GCN: store_short v[{{[0-9:]+}}], v[[VRL]]
	▲ Show 20 Lines • Show All 356 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll

	Show First 20 Lines • Show All 271 Lines • ▼ Show 20 Lines
	; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]			; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
	define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {			define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
	entry:			entry:
	%v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel			%v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
	store <2 x double> %v, <2 x double> addrspace(1)* %out			store <2 x double> %v, <2 x double> addrspace(1)* %out
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}double5_inselt:
				; GCN-NOT: v_movrel
				; GCN-NOT: buffer_
				; GCN-COUNT-10: v_cndmask_b32
				define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
				entry:
				%v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
				store <5 x double> %v, <5 x double> addrspace(1)* %out
				ret void
				}

	; GCN-LABEL: {{^}}double8_inselt:			; GCN-LABEL: {{^}}double8_inselt:
	; GCN-NOT: v_cndmask			; GCN-NOT: v_cndmask
	; GCN-NOT: buffer_			; GCN-NOT: buffer_
	; GCN-NOT: s_or_b32			; GCN-NOT: s_or_b32
	; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]			; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
	; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0			; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
	; GCN-NOT: s_mov_b32 m0			; GCN-NOT: s_mov_b32 m0
	; GCN: v_movreld_b32_e32 v[[#BASE+1]],			; GCN: v_movreld_b32_e32 v[[#BASE+1]],
	▲ Show 20 Lines • Show All 112 Lines • Show Last 20 Lines