This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] simplify shuffle of shuffle
ClosedPublic

Authored by spatel on Mar 28 2019, 2:36 PM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper
efriedma

Commits

rG12685d0f7cd8: [DAGCombiner] simplify shuffle of shuffle
rL357258: [DAGCombiner] simplify shuffle of shuffle

Summary

After investigating the examples from D59777 targeting an SSE4.1 machine, it looks like a very different problem due to how we map illegal types (256-bit in these cases).

We're missing a shuffle simplification that maps elements of a vector back to a shuffled operand. We have a more general version of this transform in DAGCombiner::visitVECTOR_SHUFFLE(), but that generality means it is limited to patterns with a one-use constraint, and the examples here have 2 uses. We don't need any uses or legality limitations for a simplification (no new value is created).

It looks like we miss this pattern in IR too.

In one of the zext examples here, we have shuffle masks like this:

Shuf0 = vector_shuffle<0,u,3,7,0,u,3,7>
Shuf = vector_shuffle<4,u,6,7,u,u,u,u>

...so that's moving the high half of the 1st vector into the low half. But the high half of the 1st vector is already identical to the low half.

Diff Detail

Event Timeline

spatel created this revision. · Mar 28 2019, 2:36 PM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 28 2019, 2:36 PM

Herald added subscribers: jdoerfert, hiraditya, mcrosier. · View Herald Transcript

LGTM

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
17952	e = Mask.size()?
17956	Maybe assert that 0 <= Mask[i] && Mask[i] < e ?
17963	Just return Shuf->getOperand(0) ?

This revision is now accepted and ready to land. · Mar 29 2019, 5:09 AM

spatel marked 3 inline comments as done. · Mar 29 2019, 6:55 AM

Closed by commit rL357258: [DAGCombiner] simplify shuffle of shuffle (authored by spatel). · Explain Why · Mar 29 2019, 7:19 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

31 lines

test/

CodeGen/

X86/

vector-zext.ll

51 lines

Diff 192715

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 17,931 Lines • ▼ Show 20 Lines	static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
// mask index value, not the insert's index value.		// mask index value, not the insert's index value.
// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'		// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),		SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
Op0.getOperand(2).getValueType());		Op0.getOperand(2).getValueType());
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),		return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
Op1, Op0.getOperand(1), NewInsIndex);		Op1, Op0.getOperand(1), NewInsIndex);
}		}

		/// If we have a unary shuffle of a shuffle, see if it can be folded away
		/// completely. This has the potential to lose undef knowledge because the first
		/// shuffle may not have an undef mask element where the second one does. So
		/// only call this after doing simplifications based on demanded elements.
		static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
		// shuf (shuf0 X, Y, Mask0), undef, Mask
		auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
		if (!Shuf0 || !Shuf->getOperand(1).isUndef())
		return SDValue();

		ArrayRef<int> Mask = Shuf->getMask();
		ArrayRef<int> Mask0 = Shuf0->getMask();
		for (int i = 0, e = (int)(Shuf->getMask().size()); i != e; ++i) {
		[Inline comment — RKSimon (Unsubmitted, Done)]: e = Mask.size()?
		// Ignore undef elements.
		if (Mask[i] == -1)
		continue;
		// Is the element of the shuffle operand chosen by this shuffle the same as
		[Inline comment — RKSimon (Unsubmitted, Done)]: Maybe assert that 0 <= Mask[i] && Mask[i] < e ?
		// the element chosen by the shuffle operand itself?
		if (Mask0[Mask[i]] != Mask0[i])
		return SDValue();
		}
		// Every element of this shuffle is identical to the result of the previous
		// shuffle, so we can replace this value.
		return SDValue(Shuf0, 0);
		[Inline comment — RKSimon (Unsubmitted, Done)]: Just return Shuf->getOperand(0) ?
		}

SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {		SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();

SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);

assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");		assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
▲ Show 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	if (V->getOpcode() == ISD::BUILD_VECTOR) {
return NewBV;		return NewBV;
}		}
}		}

// Simplify source operands based on shuffle mask.		// Simplify source operands based on shuffle mask.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))		if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);		return SDValue(N, 0);

		// This is intentionally placed after demanded elements simplification because
		// it could eliminate knowledge of undef elements created by this shuffle.
		if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
		return ShufOp;

// Match shuffles that can be converted to any_vector_extend_in_reg.		// Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))		if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
return V;		return V;

// Combine "truncate_vector_in_reg" style shuffles.		// Combine "truncate_vector_in_reg" style shuffles.
if (SDValue V = combineTruncationShuffle(SVN, DAG))		if (SDValue V = combineTruncationShuffle(SVN, DAG))
return V;		return V;

▲ Show 20 Lines • Show All 1,765 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-zext.ll

Show First 20 Lines • Show All 2,611 Lines • ▼ Show 20 Lines	; AVX512-NEXT: retq
%shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer		%shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
%ext = zext <4 x i32> %shuf to <4 x i64>		%ext = zext <4 x i32> %shuf to <4 x i64>
ret <4 x i64> %ext		ret <4 x i64> %ext
}		}

define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) {		define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) {
; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs:		; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,3,4,5,6,7]		; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]		; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]		; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs:		; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs:
; SSSE3: # %bb.0:		; SSSE3: # %bb.0:
		; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1		; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq		; SSSE3-NEXT: retq
;		;
; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs:		; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs:
; SSE41: # %bb.0:		; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]		; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,8,9,10,11,12,13,14,15]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]		; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq		; SSE41-NEXT: retq
;		;
; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs:		; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero		; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
Show All 12 Lines	; AVX512-NEXT: retq
%ext = zext <8 x i16> %shuf to <8 x i32>		%ext = zext <8 x i16> %shuf to <8 x i32>
ret <8 x i32> %ext		ret <8 x i32> %ext
}		}

define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) {		define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) {
; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]		; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,5,7]		; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]		; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,5,4]		; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE2-NEXT: pxor %xmm2, %xmm2		; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; SSSE3: # %bb.0:		; SSSE3: # %bb.0:
		; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1		; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq		; SSSE3-NEXT: retq
;		;
; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; SSE41: # %bb.0:		; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]		; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,14,15,6,7,12,13,14,15]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]		; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq		; SSE41-NEXT: retq
;		;
; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef:		; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]		; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1		; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]		; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines