Handle the poor codegen for i64/x86mmx->v2i64 (%mm -> %xmm) moves. Instead of going through a stack store/load pair to do the job, use scalar_to_vector directly, which in the MMX case can be lowered to movq2dq. This was the behavior prior to the improvements for vector legalization of extloads in r213897. This patch fixes the regression and, as a side effect, also removes some unnecessary shuffles.
In the newly attached testcase, we go from:
pshufw $-18, (%rdi), %mm0
movq %mm0, -8(%rsp)
movq -8(%rsp), %xmm0
pshufd $-44, %xmm0, %xmm0
movd %xmm0, %eax
...
To:
pshufw $-18, (%rdi), %mm0
movq2dq %mm0, %xmm0
movd %xmm0, %eax
...
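For reference, the same pattern can be reproduced from C with SSE/SSE2 intrinsics; this is an illustrative sketch, not the attached testcase, and the function name is made up. _mm_movpi64_epi64 is the intrinsic that lowers to movq2dq on SSE2 targets, i.e. the i64-in-MMX to v2i64-in-XMM move this patch improves:

#include <xmmintrin.h>  /* _mm_shuffle_pi16 (pshufw), _mm_empty (emms) */
#include <emmintrin.h>  /* _mm_movpi64_epi64 (movq2dq), _mm_cvtsi128_si32 (movd) */

int high_dword(const __m64 *p) {                /* hypothetical name */
  __m64 m = _mm_shuffle_pi16(*p, 0xEE);         /* pshufw $-18, (%rdi), %mm0 */
  __m128i x = _mm_movpi64_epi64(m);             /* movq2dq %mm0, %xmm0 */
  int r = _mm_cvtsi128_si32(x);                 /* movd %xmm0, %eax */
  _mm_empty();                                  /* clear MMX state before returning */
  return r;
}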
"Conversion"