This is an archive of the discontinued LLVM Phabricator instance.

[X86] EltsFromConsecutiveLoads - detect split loads without a common load base (PR32940)
AbandonedPublic

Authored by RKSimon on Jun 11 2017, 3:54 AM.

Download Raw Diff

Details

Reviewers

craig.topper
niravd
spatel
andreadb
filcab

Summary

As discussed on PR32940, we fail to merge some vector loads because the individual scalar loads have been split (typically by i64 -> i32 legalization on 32-bit targets), so not all of the loads are expressed as offsets from the original base load.

This patch fixes this by checking to see if the load is consecutive to any previous consecutive load, and not just the first 'base' load.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Jun 11 2017, 3:54 AM

RKSimon added a reviewer: filcab. Jun 12 2017, 5:42 AM

It looks like this is really an issue of "areNonVolatileConsecutiveLoads" not being clever enough. BaseIndexOffset in DAGCombiner should do better. I'd favor moving BaseIndexOffset out of the DAG Combiner and rewriting areNonVolatileConsecutiveLoads off of that.

Also, popping up a level, do you know if there's a reason why this is done as an X86-specific pass? It seems entirely generic.

In D34087#777942, @niravd wrote:

Also, popping up a level, do you know if there's a reason why this is done as an X86-specific pass? It seems entirely generic.

It's done here as we're trying to match various x86-specific load patterns - basic consecutive loads, 'masked' loads with undef/zeros that get blended out, loads with all upper vector elements set to zero. I'm also hoping to add broadcast/vector_broadcast soon (possibly ZERO_EXTEND_VECTOR_INREG as well).

In D34087#777942, @niravd wrote:

It looks like this is really an issue of "areNonVolatileConsecutiveLoads" not being clever enough. BaseIndexOffset in DAGCombiner should do better. I'd favor moving BaseIndexOffset out of the DAG Combiner and rewriting areNonVolatileConsecutiveLoads off of that.

Are you suggesting we move BaseIndexOffset to a header so that it can be used in SelectionDAG as well or move all uses from DAGCombiner.cpp into SelectionDAG.cpp (load combines, store merge, neighbour chains etc.)?

I'm suggesting we move BaseIndexOffset into SelectionDAG so it can be called there as well.

Are you suggesting we move BaseIndexOffset to a header so that it can be used in SelectionDAG as well or move all uses from DAGCombiner.cpp into SelectionDAG.cpp (load combines, store merge, neighbour chains etc.)?

niravd mentioned this in D34472: [DAG] Rewrite areNonVolatileConsecutiveLoads to use BaseIndexOffset. Jun 21 2017, 12:56 PM

D34472 is providing a more general solution.

niravd mentioned this in rL306819: [DAG] Rewrite areNonVolatileConsecutiveLoads to use BaseIndexOffset. Jun 30 2017, 5:24 AM

niravd mentioned this in rL307114: Rewrite areNonVolatileConsecutiveLoads to use BaseIndexOffset. Jul 4 2017, 6:22 PM

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 305161)

17 lines

test/

CodeGen/

X86/

	build-vector-128.ll
	build-vector-128.ll (revision 305161)

23 lines

	build-vector-256.ll
	build-vector-256.ll (revision 305161)

29 lines

	build-vector-512.ll
	build-vector-512.ll (revision 305161)

20 lines

Diff 102130

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,466 Lines • ▼ Show 20 Lines	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Consecutive loads with UNDEFs and ZEROs elements require a		// Consecutive loads with UNDEFs and ZEROs elements require a
// an additional shuffle stage to clear the ZERO elements.		// an additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;		bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;		bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {		for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {		if (LoadMask[i]) {
SDValue Elt = peekThroughBitcasts(Elts[i]);		SDValue Elt = peekThroughBitcasts(Elts[i]);
LoadSDNode *LD = cast<LoadSDNode>(Elt);		LoadSDNode *LD = cast<LoadSDNode>(Elt);
if (!DAG.areNonVolatileConsecutiveLoads(		// See if this load is consecutive to any previously confirmed
LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,		// consecutive load. We can't just test against LDBase as we have cases
i - FirstLoadedElt)) {		// where loads have been further split and offset from each other.
		bool IsConsecutive = false;
		for (int j = FirstLoadedElt; j < i; j = LoadMask.find_next(j)) {
		LoadSDNode *LocalBase = cast<LoadSDNode>(peekThroughBitcasts(Elts[j]));
		if (DAG.areNonVolatileConsecutiveLoads(
		LD, LocalBase, Elt.getValueType().getStoreSizeInBits() / 8,
		i - j)) {
		IsConsecutive = true;
		break;
		}
		}
		if (!IsConsecutive) {
IsConsecutiveLoad = false;		IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;		IsConsecutiveLoadWithZeros = false;
break;		break;
}		}
} else if (ZeroMask[i]) {		} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;		IsConsecutiveLoad = false;
}		}
}		}
▲ Show 20 Lines • Show All 29,830 Lines • Show Last 20 Lines

test/CodeGen/X86/build-vector-128.ll

Show First 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	; AVX-64-NEXT: retq
%ins0 = insertelement <4 x float> undef, float %a0, i32 0		%ins0 = insertelement <4 x float> undef, float %a0, i32 0
%ins1 = insertelement <4 x float> %ins0, float %a1, i32 1		%ins1 = insertelement <4 x float> %ins0, float %a1, i32 1
%ins2 = insertelement <4 x float> %ins1, float %a2, i32 2		%ins2 = insertelement <4 x float> %ins1, float %a2, i32 2
%ins3 = insertelement <4 x float> %ins2, float %a3, i32 3		%ins3 = insertelement <4 x float> %ins2, float %a3, i32 3
ret <4 x float> %ins3		ret <4 x float> %ins3
}		}

define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {		define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
; SSE2-32-LABEL: test_buildvector_v2i64:		; SSE-32-LABEL: test_buildvector_v2i64:
; SSE2-32: # BB#0:		; SSE-32: # BB#0:
; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero		; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero		; SSE-32-NEXT: retl
; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-32-NEXT: retl
;		;
; SSE-64-LABEL: test_buildvector_v2i64:		; SSE-64-LABEL: test_buildvector_v2i64:
; SSE-64: # BB#0:		; SSE-64: # BB#0:
; SSE-64-NEXT: movq %rsi, %xmm1		; SSE-64-NEXT: movq %rsi, %xmm1
; SSE-64-NEXT: movq %rdi, %xmm0		; SSE-64-NEXT: movq %rdi, %xmm0
; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]		; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq		; SSE-64-NEXT: retq
;		;
; SSE41-32-LABEL: test_buildvector_v2i64:
; SSE41-32: # BB#0:
; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; SSE41-32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
; SSE41-32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; SSE41-32-NEXT: retl
;
; AVX-32-LABEL: test_buildvector_v2i64:		; AVX-32-LABEL: test_buildvector_v2i64:
; AVX-32: # BB#0:		; AVX-32: # BB#0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero		; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: retl		; AVX-32-NEXT: retl
;		;
; AVX-64-LABEL: test_buildvector_v2i64:		; AVX-64-LABEL: test_buildvector_v2i64:
; AVX-64: # BB#0:		; AVX-64: # BB#0:
; AVX-64-NEXT: vmovq %rsi, %xmm0		; AVX-64-NEXT: vmovq %rsi, %xmm0
; AVX-64-NEXT: vmovq %rdi, %xmm1		; AVX-64-NEXT: vmovq %rdi, %xmm1
; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]		; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-64-NEXT: retq		; AVX-64-NEXT: retq
▲ Show 20 Lines • Show All 314 Lines • Show Last 20 Lines

test/CodeGen/X86/build-vector-256.ll

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	; AVX-64-NEXT: retq
%ins4 = insertelement <8 x float> %ins3, float %a4, i32 4		%ins4 = insertelement <8 x float> %ins3, float %a4, i32 4
%ins5 = insertelement <8 x float> %ins4, float %a5, i32 5		%ins5 = insertelement <8 x float> %ins4, float %a5, i32 5
%ins6 = insertelement <8 x float> %ins5, float %a6, i32 6		%ins6 = insertelement <8 x float> %ins5, float %a6, i32 6
%ins7 = insertelement <8 x float> %ins6, float %a7, i32 7		%ins7 = insertelement <8 x float> %ins6, float %a7, i32 7
ret <8 x float> %ins7		ret <8 x float> %ins7
}		}

define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {		define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
; AVX1-32-LABEL: test_buildvector_v4i64:		; AVX-32-LABEL: test_buildvector_v4i64:
; AVX1-32: # BB#0:		; AVX-32: # BB#0:
; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero		; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0		; AVX-32-NEXT: retl
; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX1-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-32-NEXT: retl
;		;
; AVX1-64-LABEL: test_buildvector_v4i64:		; AVX1-64-LABEL: test_buildvector_v4i64:
; AVX1-64: # BB#0:		; AVX1-64: # BB#0:
; AVX1-64-NEXT: vmovq %rcx, %xmm0		; AVX1-64-NEXT: vmovq %rcx, %xmm0
; AVX1-64-NEXT: vmovq %rdx, %xmm1		; AVX1-64-NEXT: vmovq %rdx, %xmm1
; AVX1-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]		; AVX1-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-64-NEXT: vmovq %rsi, %xmm1		; AVX1-64-NEXT: vmovq %rsi, %xmm1
; AVX1-64-NEXT: vmovq %rdi, %xmm2		; AVX1-64-NEXT: vmovq %rdi, %xmm2
; AVX1-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]		; AVX1-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0		; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-64-NEXT: retq		; AVX1-64-NEXT: retq
;		;
; AVX2-32-LABEL: test_buildvector_v4i64:
; AVX2-32: # BB#0:
; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX2-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX2-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_buildvector_v4i64:		; AVX2-64-LABEL: test_buildvector_v4i64:
; AVX2-64: # BB#0:		; AVX2-64: # BB#0:
; AVX2-64-NEXT: vmovq %rcx, %xmm0		; AVX2-64-NEXT: vmovq %rcx, %xmm0
; AVX2-64-NEXT: vmovq %rdx, %xmm1		; AVX2-64-NEXT: vmovq %rdx, %xmm1
; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]		; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-64-NEXT: vmovq %rsi, %xmm1		; AVX2-64-NEXT: vmovq %rsi, %xmm1
; AVX2-64-NEXT: vmovq %rdi, %xmm2		; AVX2-64-NEXT: vmovq %rdi, %xmm2
; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]		; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
▲ Show 20 Lines • Show All 336 Lines • Show Last 20 Lines

test/CodeGen/X86/build-vector-512.ll

Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	; AVX-64-NEXT: retq
%ins14 = insertelement <16 x float> %ins13, float %a14, i32 14		%ins14 = insertelement <16 x float> %ins13, float %a14, i32 14
%ins15 = insertelement <16 x float> %ins14, float %a15, i32 15		%ins15 = insertelement <16 x float> %ins14, float %a15, i32 15
ret <16 x float> %ins15		ret <16 x float> %ins15
}		}

define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) {		define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) {
; AVX-32-LABEL: test_buildvector_v8i64:		; AVX-32-LABEL: test_buildvector_v8i64:
; AVX-32: # BB#0:		; AVX-32: # BB#0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero		; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX-32-NEXT: retl		; AVX-32-NEXT: retl
;		;
; AVX-64-LABEL: test_buildvector_v8i64:		; AVX-64-LABEL: test_buildvector_v8i64:
; AVX-64: # BB#0:		; AVX-64: # BB#0:
; AVX-64-NEXT: vmovq %rcx, %xmm0		; AVX-64-NEXT: vmovq %rcx, %xmm0
; AVX-64-NEXT: vmovq %rdx, %xmm1		; AVX-64-NEXT: vmovq %rdx, %xmm1
; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]		; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-64-NEXT: vmovq %rsi, %xmm1		; AVX-64-NEXT: vmovq %rsi, %xmm1
▲ Show 20 Lines • Show All 604 Lines • Show Last 20 Lines