This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] reduce shuffle of concat of same vector
ClosedPublic

Authored by spatel on Jan 6 2020, 12:56 PM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper

Commits

rG58e2e92a57fc: [DAGCombiner] reduce shuffle of concat of same vector

Summary

This is possibly a small part towards solving PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024

The vectorizer is creating shuffles of concat like this:

%63 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%64 = shufflevector <8 x i64> %63, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>

That might be fixable in the vectorizers, but we're not allowed to fold that into a single shuffle in instcombine, so we should have a backend backstop to convert that into the likely simpler form:

%64 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

spatel created this revision. · Jan 6 2020, 12:56 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 6 2020, 12:56 PM

Herald added subscribers: hiraditya, mcrosier. · View Herald Transcript

LG

Accept button missed?

LGTM

This revision is now accepted and ready to land. · Jan 6 2020, 8:53 PM

Why doesn't partitionShuffleOfConcats catch this?

In D72300#1807578, @RKSimon wrote:

Why doesn't partitionShuffleOfConcats catch this?

It's not exactly the pattern that function is looking for:

// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.

I did consider putting this block into that helper since it's similar, but I didn't see any real shared code savings in the pattern matching. I can still move it inside there for better organization of folds.

It's fine to leave it where it is - LGTM, cheers.

In D72300#1807837, @RKSimon wrote:

It's fine to leave it where it is - LGTM, cheers.

On 2nd look at more motivating patterns, I think we should move it into partitionShuffleOfConcats() and loosen the isShuffleMaskLegal() restriction, but I'll make that a follow-on with more tests.

That's because we have illegal vectors like this being produced for AVX1/2 targets:

define <16 x i32> @concat_self_v4i32(<4 x i32> %x) {
  %t90 = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %t91 = shufflevector <8 x i32> %t90, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s = shufflevector <16 x i32> %t91, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i32> %s
}

This patch won't fire as-is on that because the 512-bit vector requires an illegal shuffle mask for AVX1/2.

Closed by commit rG58e2e92a57fc: [DAGCombiner] reduce shuffle of concat of same vector (authored by spatel). · Explain Why · Jan 7 2020, 6:55 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

24 lines

test/

CodeGen/

X86/

vector-shuffle-combining-avx.ll

33 lines

Diff 236578

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,280 Lines • ▼ Show 20 Lines	if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
Level < AfterLegalizeVectorOps &&		Level < AfterLegalizeVectorOps &&
(N1.isUndef() \|\|		(N1.isUndef() \|\|
(N1.getOpcode() == ISD::CONCAT_VECTORS &&		(N1.getOpcode() == ISD::CONCAT_VECTORS &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {		N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
if (SDValue V = partitionShuffleOfConcats(N, DAG))		if (SDValue V = partitionShuffleOfConcats(N, DAG))
return V;		return V;
}		}

		// A shuffle of a concat of the same narrow vector can be reduced to use
		// only low-half elements of a concat with undef:
		// shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
		if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
		N0.getNumOperands() == 2 &&
		N0.getOperand(0) == N0.getOperand(1)) {
		int HalfNumElts = (int)NumElts / 2;
		SmallVector<int, 8> NewMask;
		for (unsigned i = 0; i != NumElts; ++i) {
		int Idx = SVN->getMaskElt(i);
		if (Idx >= HalfNumElts) {
		assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
		Idx -= HalfNumElts;
		}
		NewMask.push_back(Idx);
		}
		if (TLI.isShuffleMaskLegal(NewMask, VT)) {
		SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
		SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
		N0.getOperand(0), UndefVec);
		return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
		}
		}

// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -		// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.		// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))		if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))		if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
return Res;		return Res;

// If this shuffle only has a single input that is a bitcasted shuffle,		// If this shuffle only has a single input that is a bitcasted shuffle,
// attempt to merge the 2 shuffles and suitably bitcast the inputs/output		// attempt to merge the 2 shuffles and suitably bitcast the inputs/output
▲ Show 20 Lines • Show All 1,898 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Show First 20 Lines • Show All 417 Lines • ▼ Show 20 Lines	entry:
unreachable		unreachable
}		}

define <4 x i64> @concat_self_v4i64(<2 x i64> %x) {		define <4 x i64> @concat_self_v4i64(<2 x i64> %x) {
; AVX1-LABEL: concat_self_v4i64:		; AVX1-LABEL: concat_self_v4i64:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]		; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX1-NEXT: ret{{[l\|q]}}		; AVX1-NEXT: ret{{[l\|q]}}
;		;
; AVX2-LABEL: concat_self_v4i64:		; AVX2-LABEL: concat_self_v4i64:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: ret{{[l\|q]}}		; AVX2-NEXT: ret{{[l\|q]}}
;		;
; AVX512-LABEL: concat_self_v4i64:		; AVX512-LABEL: concat_self_v4i64:
; AVX512: # %bb.0:		; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: ret{{[l\|q]}}		; AVX512-NEXT: ret{{[l\|q]}}
%cat = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%cat = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%s = shufflevector <4 x i64> %cat, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>		%s = shufflevector <4 x i64> %cat, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x i64> %s		ret <4 x i64> %s
}		}

define <8 x i32> @concat_self_v8i32(<4 x i32> %x) {		define <8 x i32> @concat_self_v8i32(<4 x i32> %x) {
; AVX1-LABEL: concat_self_v8i32:		; AVX1-LABEL: concat_self_v8i32:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1		; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,1,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,4,6,5,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2		; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0		; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0		; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: ret{{[l\|q]}}		; AVX1-NEXT: ret{{[l\|q]}}
;		;
; AVX2-LABEL: concat_self_v8i32:		; AVX2-LABEL: concat_self_v8i32:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0		; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,0,2,1,3]		; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm1		; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0		; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: ret{{[l\|q]}}		; AVX2-NEXT: ret{{[l\|q]}}
;		;
; AVX512-LABEL: concat_self_v8i32:		; AVX512-LABEL: concat_self_v8i32:
; AVX512: # %bb.0:		; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0		; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,0,2,1,3]		; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm1		; AVX512-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0		; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: ret{{[l\|q]}}		; AVX512-NEXT: ret{{[l\|q]}}
%cat = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>		%cat = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%s = shufflevector <8 x i32> %cat, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 3>		%s = shufflevector <8 x i32> %cat, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 3>
%a = add <8 x i32> %s, %cat		%a = add <8 x i32> %s, %cat
ret <8 x i32> %a		ret <8 x i32> %a
}		}