This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine] Avoid INSERT_SUBVECTOR reinsertions (PR28678)
ClosedPublic

Authored by RKSimon on Aug 9 2016, 11:54 AM.

Download Raw Diff

Details

Reviewers

spatel
majnemer
andreadb
mkuper

Commits

rG85c7ea86ae93: [DAGCombine] Avoid INSERT_SUBVECTOR reinsertions (PR28678)
rL278211: [DAGCombine] Avoid INSERT_SUBVECTOR reinsertions (PR28678)

Summary

If the input vector to INSERT_SUBVECTOR is itself an INSERT_SUBVECTOR, and the new subvector has the same type and is inserted at the same index as the earlier one (so it completely overwrites it), then insert directly into the inner node's source vector instead.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 67385. Aug 9 2016, 11:54 AM

RKSimon retitled this revision to [DAGCombine] Avoid INSERT_SUBVECTOR reinsertions (PR28678).

RKSimon updated this object.

RKSimon added reviewers: majnemer, mkuper, spatel, andreadb.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

LGTM.

Although I'm somewhat curious about how we even end up with this pattern - I guess for test_mm256_insert_epi64, we really need to have two insert_elements post-legalization, but why do we end up with two insert_subvectors?

lib/CodeGen/SelectionDAG/DAGCombiner.cpp
13810 ↗	(On Diff #67385)	Could you make this comment slightly easier to parse? It's probably my fault, but it took me a bit too long to understand what you meant. :-) Maybe explicitly say that both insert into the same lane, instead of (or in addition to) using "replaces"?

mkuper accepted this revision. Aug 9 2016, 4:08 PM

mkuper edited edge metadata.

This revision is now accepted and ready to land. Aug 9 2016, 4:08 PM

Closed by commit rL278211: [DAGCombine] Avoid INSERT_SUBVECTOR reinsertions (PR28678) (authored by RKSimon). · Explain Why · Aug 10 2016, 3:58 AM

This revision was automatically updated to reflect the committed changes.

In D23330#510591, @mkuper wrote:

Although I'm somewhat curious about how we even end up with this pattern - I guess for test_mm256_insert_epi64, we really need to have two insert_elements post-legalization, but why do we end up with two insert_subvectors?

Thanks, I've cleaned up the description by using pseudocode instead. I also realised that we need to check the subvector types are the same (e.g. to protect against inserting a 256 followed by a 128 into a 512).

Any time that we're doing partial insertions (i.e. something that build vector won't handle) we tend to end up with these cases - each insertion has a separate extractsub/insertelt/insertsub pattern - what's curious is that a common extractsub is being used.

If you notice, for the lower 128-bit vectors we still have a lot of duplicate blends to bring the lower/upper halves back together again.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

11 lines

test/

CodeGen/

X86/

avx-intrinsics-fast-isel.ll

3 lines

insertelement-zero.ll

2 lines

Diff 67503

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 13,796 Lines • ▼ Show 20 Lines	if (C0 && VT == InVec.getValueType() &&
NewMask);		NewMask);
}		}
}		}

return SDValue();		return SDValue();
}		}

SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {		SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
		EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);		SDValue N2 = N->getOperand(2);

		// Combine INSERT_SUBVECTORs where we are inserting to the same index.
		// INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
		// --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
		if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
		N0.getOperand(1).getValueType() == N1.getValueType() &&
		N0.getOperand(2) == N2)
		return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
		N1, N2);

if (N0.getValueType() != N1.getValueType())		if (N0.getValueType() != N1.getValueType())
return SDValue();		return SDValue();

// If the input vector is a concatenation, and the insert replaces		// If the input vector is a concatenation, and the insert replaces
// one of the halves, we can optimize into a single concat_vectors.		// one of the halves, we can optimize into a single concat_vectors.
if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&		if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&
N2.getOpcode() == ISD::Constant) {		N2.getOpcode() == ISD::Constant) {
APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue();		APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue();
EVT VT = N->getValueType(0);

// Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->		// Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->
// (concat_vectors Z, Y)		// (concat_vectors Z, Y)
if (InsIdx == 0)		if (InsIdx == 0)
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,		return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,
N0.getOperand(1));		N0.getOperand(1));

// Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->		// Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->
▲ Show 20 Lines • Show All 1,227 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll

Show First 20 Lines • Show All 1,020 Lines • ▼ Show 20 Lines	; X64-NEXT: retq
ret <4 x i64> %bc		ret <4 x i64> %bc
}		}

define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {		define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi64:		; X32-LABEL: test_mm256_insert_epi64:
; X32: # BB#0:		; X32: # BB#0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm1		; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1		; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm2		; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0		; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X32-NEXT: retl		; X32-NEXT: retl
;		;
; X64-LABEL: test_mm256_insert_epi64:		; X64-LABEL: test_mm256_insert_epi64:
; X64: # BB#0:		; X64: # BB#0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1		; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1		; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0		; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq		; X64-NEXT: retq
▲ Show 20 Lines • Show All 2,739 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/insertelement-zero.ll

	Show First 20 Lines • Show All 581 Lines • ▼ Show 20 Lines
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: xorl %eax, %eax
	; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1			; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
	; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]			; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
	; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1			; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
	; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]			; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
	; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1			; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1			; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:			; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: xorl %eax, %eax			; AVX2-NEXT: xorl %eax, %eax
	; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1			; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
	; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]			; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
	; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1			; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
	; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]			; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1			; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
	; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1			; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
	; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
	; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1			; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
	; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0			; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%1 = insertelement <32 x i8> %a, i8 0, i32 0			%1 = insertelement <32 x i8> %a, i8 0, i32 0
	%2 = insertelement <32 x i8> %1, i8 0, i32 15			%2 = insertelement <32 x i8> %1, i8 0, i32 15
	%3 = insertelement <32 x i8> %2, i8 0, i32 30			%3 = insertelement <32 x i8> %2, i8 0, i32 30
	%4 = insertelement <32 x i8> %3, i8 0, i32 31			%4 = insertelement <32 x i8> %3, i8 0, i32 31
	ret <32 x i8> %4			ret <32 x i8> %4
	}			}