This is an archive of the discontinued LLVM Phabricator instance.

[X86] Improve a dag-combine that handles a vector extract -> zext sequence.
ClosedPublic

Authored by mkuper on Dec 3 2014, 4:49 AM.

Download Raw Diff

Details

Reviewers

qcolombet
nadav

Commits

rG0492bd2b9e45: [X86] Improve a dag-combine that handles a vector extract -> zext sequence.
rL223360: [X86] Improve a dag-combine that handles a vector extract -> zext sequence.

Summary

The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads.
According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better.

Diff Detail

Repository: rL LLVM

Event Timeline

mkuper updated this revision to Diff 16860.Dec 3 2014, 4:49 AM

mkuper retitled this revision from to [X86] Improve a dag-combine that handles a vector extract -> zext sequence..

mkuper updated this object.

mkuper edited the test plan for this revision. (Show Details)

mkuper added reviewers: qcolombet, nadav.

I did not review the code carefully but overall it looks good.

Hi Michael,

This LGTM with one request.
Could you add a test case where we fall back to the old sequence of store + loads?
No need to send an updated patch.

Thanks,
-Quentin

lib/Target/X86/X86ISelLowering.cpp
22656 ↗	(On Diff #16860)	Period at the end of the comment.

This revision is now accepted and ready to land.Dec 3 2014, 9:53 AM

mkuper edited edge metadata.Dec 3 2014, 1:28 PM

mkuper added a subscriber: Unknown Object (MLST).

Thanks, Quentin!

Sure, will add that.
It'll have to be slightly more contrived because it needs to have an explicit zext, as opposed to an implicit extension coming from a GEP, but that's no big deal.

Closed by commit rL223360 (authored by @mkuper).

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

75 lines

test/

CodeGen/

X86/

gather-addresses.ll

83 lines

Diff 16922

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 22,552 Lines • ▼ Show 20 Lines	Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
&ShuffleMask[0]);		&ShuffleMask[0]);
Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);		Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);		EltNo);
}		}

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index		/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts		/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.		/// into a somewhat faster sequence. For i686, the best sequence is apparently
		/// storing the value and loading scalars back, while for x64 we should
		/// use 64-bit extracts and shifts.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,		static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {		TargetLowering::DAGCombinerInfo &DCI) {
SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);		SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
if (NewOp.getNode())		if (NewOp.getNode())
return NewOp;		return NewOp;

SDValue InputVector = N->getOperand(0);		SDValue InputVector = N->getOperand(0);

▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
Uses.push_back(Extract);		Uses.push_back(Extract);
}		}

// If not all the elements were used, this may not be worthwhile.		// If not all the elements were used, this may not be worthwhile.
if (ExtractedElements != 15)		if (ExtractedElements != 15)
return SDValue();		return SDValue();

// Ok, we've now decided to do the transformation.		// Ok, we've now decided to do the transformation.
		// If 64-bit shifts are legal, use the extract-shift sequence,
		// otherwise bounce the vector off the cache.
		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		SDValue Vals[4];
SDLoc dl(InputVector);		SDLoc dl(InputVector);

		if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
		SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
		EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
		SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
		DAG.getConstant(0, VecIdxTy));
		SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
		DAG.getConstant(1, VecIdxTy));

		SDValue ShAmt = DAG.getConstant(32,
		DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
		Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
		Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
		DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
		Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
		Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
		DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
		} else {
// Store the value to a temporary stack slot.		// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());		SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,		SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo(), false, false, 0);		MachinePointerInfo(), false, false, 0);

// Replace each use (extract) with a load of the appropriate element.		EVT ElementType = InputVector.getValueType().getVectorElementType();
for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),		unsigned EltSize = ElementType.getSizeInBits() / 8;
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;

// Compute the element's address.		// Replace each use (extract) with a load of the appropriate element.
SDValue Idx = Extract->getOperand(1);		for (unsigned i = 0; i < 4; ++i) {
unsigned EltSize =		uint64_t Offset = EltSize * i;
InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());		SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),		SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
StackPtr, OffsetVal);		StackPtr, OffsetVal);

// Load the scalar.		// Load the scalar.
SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,		Vals[i] = DAG.getLoad(ElementType, dl, Ch,
ScalarAddr, MachinePointerInfo(),		ScalarAddr, MachinePointerInfo(),
false, false, false, 0);		false, false, false, 0);

// Replace the exact with the load.		}
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);		}

		// Replace the extracts
		for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
		UE = Uses.end(); UI != UE; ++UI) {
		SDNode *Extract = *UI;

		SDValue Idx = Extract->getOperand(1);
		uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
		DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}		}

// The replacement was made in place; don't return anything.		// The replacement was made in place; don't return anything.
return SDValue();		return SDValue();
}		}

/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.		/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
static std::pair<unsigned, bool>		static std::pair<unsigned, bool>
▲ Show 20 Lines • Show All 3,673 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/gather-addresses.ll

	; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN			; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
	; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN			; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
				; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
	; rdar://7398554			; rdar://7398554

	; When doing vector gather-scatter index calculation with 32-bit indices,			; When doing vector gather-scatter index calculation with 32-bit indices,
	; bounce the vector off of cache rather than shuffling each individual			; use an efficient mov/shift sequence rather than shuffling each individual
	; element out of the index vector.			; element out of the index vector.

	; CHECK: foo:			; CHECK-LABEL: foo:
	; LIN: movaps (%rsi), %xmm0			; LIN: movdqa (%rsi), %xmm0
	; LIN: andps (%rdx), %xmm0			; LIN: pand (%rdx), %xmm0
	; LIN: movaps %xmm0, -24(%rsp)			; LIN: pextrq $1, %xmm0, %r[[REG4:.+]]
	; LIN: movslq -24(%rsp), %[[REG1:r.+]]			; LIN: movd %xmm0, %r[[REG2:.+]]
	; LIN: movslq -20(%rsp), %[[REG2:r.+]]			; LIN: movslq %e[[REG2]], %r[[REG1:.+]]
	; LIN: movslq -16(%rsp), %[[REG3:r.+]]			; LIN: sarq $32, %r[[REG2]]
	; LIN: movslq -12(%rsp), %[[REG4:r.+]]			; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
	; LIN: movsd (%rdi,%[[REG1]],8), %xmm0			; LIN: sarq $32, %r[[REG4]]
	; LIN: movhpd (%rdi,%[[REG2]],8), %xmm0			; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
	; LIN: movsd (%rdi,%[[REG3]],8), %xmm1			; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
	; LIN: movhpd (%rdi,%[[REG4]],8), %xmm1			; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
				; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1

	; WIN: movaps (%rdx), %xmm0			; WIN: movdqa (%rdx), %xmm0
	; WIN: andps (%r8), %xmm0			; WIN: pand (%r8), %xmm0
	; WIN: movaps %xmm0, (%rsp)			; WIN: pextrq $1, %xmm0, %r[[REG4:.+]]
	; WIN: movslq (%rsp), %[[REG1:r.+]]			; WIN: movd %xmm0, %r[[REG2:.+]]
	; WIN: movslq 4(%rsp), %[[REG2:r.+]]			; WIN: movslq %e[[REG2]], %r[[REG1:.+]]
	; WIN: movslq 8(%rsp), %[[REG3:r.+]]			; WIN: sarq $32, %r[[REG2]]
	; WIN: movslq 12(%rsp), %[[REG4:r.+]]			; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
	; WIN: movsd (%rcx,%[[REG1]],8), %xmm0			; WIN: sarq $32, %r[[REG4]]
	; WIN: movhpd (%rcx,%[[REG2]],8), %xmm0			; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
	; WIN: movsd (%rcx,%[[REG3]],8), %xmm1			; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
	; WIN: movhpd (%rcx,%[[REG4]],8), %xmm1			; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
				; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1

	define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {			define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
	%a = load <4 x i32>* %i			%a = load <4 x i32>* %i
	%b = load <4 x i32>* %h			%b = load <4 x i32>* %h
	%j = and <4 x i32> %a, %b			%j = and <4 x i32> %a, %b
	%d0 = extractelement <4 x i32> %j, i32 0			%d0 = extractelement <4 x i32> %j, i32 0
	%d1 = extractelement <4 x i32> %j, i32 1			%d1 = extractelement <4 x i32> %j, i32 1
	%d2 = extractelement <4 x i32> %j, i32 2			%d2 = extractelement <4 x i32> %j, i32 2
	%d3 = extractelement <4 x i32> %j, i32 3			%d3 = extractelement <4 x i32> %j, i32 3
	%q0 = getelementptr double* %p, i32 %d0			%q0 = getelementptr double* %p, i32 %d0
	%q1 = getelementptr double* %p, i32 %d1			%q1 = getelementptr double* %p, i32 %d1
	%q2 = getelementptr double* %p, i32 %d2			%q2 = getelementptr double* %p, i32 %d2
	%q3 = getelementptr double* %p, i32 %d3			%q3 = getelementptr double* %p, i32 %d3
	%r0 = load double* %q0			%r0 = load double* %q0
	%r1 = load double* %q1			%r1 = load double* %q1
	%r2 = load double* %q2			%r2 = load double* %q2
	%r3 = load double* %q3			%r3 = load double* %q3
	%v0 = insertelement <4 x double> undef, double %r0, i32 0			%v0 = insertelement <4 x double> undef, double %r0, i32 0
	%v1 = insertelement <4 x double> %v0, double %r1, i32 1			%v1 = insertelement <4 x double> %v0, double %r1, i32 1
	%v2 = insertelement <4 x double> %v1, double %r2, i32 2			%v2 = insertelement <4 x double> %v1, double %r2, i32 2
	%v3 = insertelement <4 x double> %v2, double %r3, i32 3			%v3 = insertelement <4 x double> %v2, double %r3, i32 3
	ret <4 x double> %v3			ret <4 x double> %v3
	}			}

				; Check that the sequence previously used above, which bounces the vector off the
				; cache works for x86-32. Note that in this case it will not be used for index
				; calculation, since indexes are 32-bit, not 64.
				; CHECK-LABEL: old:
				; LIN32: movaps %xmm0, (%esp)
				; LIN32-DAG: {{(mov|and)}}l (%esp),
				; LIN32-DAG: {{(mov|and)}}l 4(%esp),
				; LIN32-DAG: {{(mov|and)}}l 8(%esp),
				; LIN32-DAG: {{(mov|and)}}l 12(%esp),
				define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
				%a = load <4 x i32>* %i
				%b = load <4 x i32>* %h
				%j = and <4 x i32> %a, %b
				%d0 = extractelement <4 x i32> %j, i32 0
				%d1 = extractelement <4 x i32> %j, i32 1
				%d2 = extractelement <4 x i32> %j, i32 2
				%d3 = extractelement <4 x i32> %j, i32 3
				%q0 = zext i32 %d0 to i64
				%q1 = zext i32 %d1 to i64
				%q2 = zext i32 %d2 to i64
				%q3 = zext i32 %d3 to i64
				%r0 = and i64 %q0, %f
				%r1 = and i64 %q1, %f
				%r2 = and i64 %q2, %f
				%r3 = and i64 %q3, %f
				%v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
				%v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
				%v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
				%v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3
				ret <4 x i64> %v3
				}