This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] simplify shuffle of shuffle
ClosedPublic

Authored by spatel on Mar 28 2019, 2:36 PM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper
efriedma

Commits

rG12685d0f7cd8: [DAGCombiner] simplify shuffle of shuffle
rL357258: [DAGCombiner] simplify shuffle of shuffle

Summary

After investigating the examples from D59777 targeting an SSE4.1 machine, it looks like a very different problem due to how we map illegal types (256-bit in these cases).

We're missing a shuffle simplification that maps elements of a vector back to a shuffled operand. We have a more general version of this transform in DAGCombiner::visitVECTOR_SHUFFLE(), but that generality means it is limited to patterns with a one-use constraint, and the examples here have 2 uses. We don't need any uses or legality limitations for a simplification (no new value is created).

It looks like we miss this pattern in IR too.

In one of the zext examples here, we have shuffle masks like this:

Shuf0 = vector_shuffle<0,u,3,7,0,u,3,7>
Shuf = vector_shuffle<4,u,6,7,u,u,u,u>

...so that's moving the high half of the 1st vector into the low half. But the high half of the 1st vector is already identical to the low half.

Diff Detail

Repository: rL LLVM

Event Timeline

spatel created this revision. · Mar 28 2019, 2:36 PM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 28 2019, 2:36 PM

Herald added subscribers: jdoerfert, hiraditya, mcrosier. · View Herald Transcript

LGTM

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
17952 ↗	(On Diff #192715)	e = Mask.size()?
17956 ↗	(On Diff #192715)	Maybe assert that 0 <= Mask[i] && Mask[i] < e ?
17963 ↗	(On Diff #192715)	Just return Shuf->getOperand(0) ?

This revision is now accepted and ready to land. · Mar 29 2019, 5:09 AM

spatel marked 3 inline comments as done. · Mar 29 2019, 6:55 AM

Closed by commit rL357258: [DAGCombiner] simplify shuffle of shuffle (authored by spatel). · Explain Why · Mar 29 2019, 7:19 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

33 lines

test/

CodeGen/

X86/

vector-zext.ll

51 lines

Diff 192820

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 17,970 Lines • ▼ Show 20 Lines	static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
// mask index value, not the insert's index value.		// mask index value, not the insert's index value.
// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'		// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),		SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
Op0.getOperand(2).getValueType());		Op0.getOperand(2).getValueType());
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),		return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
Op1, Op0.getOperand(1), NewInsIndex);		Op1, Op0.getOperand(1), NewInsIndex);
}		}

		/// Try to remove a unary shuffle whose single defined operand is itself a
		/// shuffle. The fold applies when every defined lane of the outer shuffle
		/// selects, from the inner shuffle, the same source element the inner shuffle
		/// already placed in that lane; the outer shuffle is then redundant.
		///
		/// Returning the inner shuffle may drop undef lanes that only the outer mask
		/// has, so this should run after demanded-elements based simplification.
		///
		/// \param Shuf the outer shuffle node to inspect.
		/// \returns operand 0 of \p Shuf when the fold applies, otherwise an empty
		/// SDValue.
		static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
		  // Pattern: shuf (shuf0 X, Y, Mask0), undef, Mask
		  auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
		  if (!Shuf0)
		    return SDValue();
		  if (!Shuf->getOperand(1).isUndef())
		    return SDValue();

		  ArrayRef<int> OuterMask = Shuf->getMask();
		  ArrayRef<int> InnerMask = Shuf0->getMask();
		  const int NumElts = (int)OuterMask.size();
		  for (int Lane = 0; Lane != NumElts; ++Lane) {
		    const int Src = OuterMask[Lane];

		    // Undef lanes of the outer shuffle place no constraint on the result.
		    if (Src == -1)
		      continue;
		    assert(Src >= 0 && Src < NumElts && "Unexpected shuffle mask value");

		    // The outer shuffle reads inner lane Src into this lane; that is only a
		    // no-op if the inner shuffle put the same source element in both lanes.
		    if (InnerMask[Src] != InnerMask[Lane])
		      return SDValue();
		  }

		  // All defined lanes match the inner shuffle's result, so use it directly.
		  return Shuf->getOperand(0);
		}

SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {		SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();

SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);

assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");		assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
▲ Show 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	if (V->getOpcode() == ISD::BUILD_VECTOR) {
return NewBV;		return NewBV;
}		}
}		}

// Simplify source operands based on shuffle mask.		// Simplify source operands based on shuffle mask.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))		if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);		return SDValue(N, 0);

		// This is intentionally placed after demanded elements simplification because
		// it could eliminate knowledge of undef elements created by this shuffle.
		if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
		return ShufOp;

// Match shuffles that can be converted to any_vector_extend_in_reg.		// Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))		if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
return V;		return V;

// Combine "truncate_vector_in_reg" style shuffles.		// Combine "truncate_vector_in_reg" style shuffles.
if (SDValue V = combineTruncationShuffle(SVN, DAG))		if (SDValue V = combineTruncationShuffle(SVN, DAG))
return V;		return V;

▲ Show 20 Lines • Show All 1,765 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-zext.ll

Show First 20 Lines • Show All 2,611 Lines • ▼ Show 20 Lines	; AVX512-NEXT: retq
%shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer		%shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
%ext = zext <4 x i32> %shuf to <4 x i64>		%ext = zext <4 x i32> %shuf to <4 x i64>
ret <4 x i64> %ext		ret <4 x i64> %ext
}		}

define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) {		define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) {
; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs:		; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,3,4,5,6,7]		; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]		; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]		; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs:		; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs:
; SSSE3: # %bb.0:		; SSSE3: # %bb.0:
		; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1		; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq		; SSSE3-NEXT: retq
;		;
; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs:		; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs:
; SSE41: # %bb.0:		; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]		; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,8,9,10,11,12,13,14,15]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]		; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq		; SSE41-NEXT: retq
;		;
; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs:		; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero		; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
Show All 12 Lines	; AVX512-NEXT: retq
%ext = zext <8 x i16> %shuf to <8 x i32>		%ext = zext <8 x i16> %shuf to <8 x i32>
ret <8 x i32> %ext		ret <8 x i32> %ext
}		}

define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) {		define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) {
; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]		; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,5,7]		; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]		; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,5,4]		; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE2-NEXT: pxor %xmm2, %xmm2		; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; SSSE3: # %bb.0:		; SSSE3: # %bb.0:
		; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1		; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq		; SSSE3-NEXT: retq
;		;
; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; SSE41: # %bb.0:		; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]		; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,14,15,6,7,12,13,14,15]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]		; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq		; SSE41-NEXT: retq
;		;
; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]		; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1		; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]		; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines