This is an archive of the discontinued LLVM Phabricator instance.

[X86] Add a hack to combinePMULDQ to manually turn SIGN_EXTEND_VECTOR_INREG/ZERO_EXTEND_VECTOR_INREG inputs into an ANY_EXTEND_VECTOR_INREG style shuffle
ClosedPublic

Authored by craig.topper on Aug 19 2019, 1:16 PM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel

Commits

rL369942: [X86] Add a hack to combinePMULDQ to manually turn…
rG36d1588f017b: [X86] Add a hack to combinePMULDQ to manually turn…

Summary

ANY_EXTEND_VECTOR_INREG isn't currently marked Legal, which prevents SimplifyDemandedBits from turning SIGN/ZERO_EXTEND_VECTOR_INREG into it after op legalization. And even if we did make it Legal, combineExtInVec doesn't do shuffle combining on the VECTOR_INREG nodes until AVX1.

This patch adds a quick hack to combinePMULDQ to directly emit a vector shuffle corresponding to an ANY_EXTEND_VECTOR_INREG operation. This avoids both of those issues without creating any other regressions on our tests. The xop-ifma.ll change here also showed up when I tried to resurrect D56306 and seemed to be the only improvement that patch creates now. This is a more direct way to get the benefit.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

craig.topper created this revision. · Aug 19 2019, 1:16 PM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 19 2019, 1:16 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

Ping

LGTM, tbh I'd much prefer to get ANY_EXTEND_VECTOR_INREG handled properly, but there's still some yak shaving to do there.....

This revision is now accepted and ready to land. · Aug 26 2019, 7:03 AM

Closed by commit rG36d1588f017b: [X86] Add a hack to combinePMULDQ to manually turn… (authored by craig.topper). · Explain Why · Aug 26 2019, 11:29 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

28 lines

test/

CodeGen/

X86/

pmul.ll

11 lines

xop-ifma.ll

12 lines

Diff 217215

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 44,653 Lines • ▼ Show 20 Lines	static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
if (ISD::isBuildVectorAllZeros(RHS.getNode()))		if (ISD::isBuildVectorAllZeros(RHS.getNode()))
return RHS;		return RHS;

// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.		// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))		if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);		return SDValue(N, 0);

		// If the input is an extend_invec and the SimplifyDemandedBits call didn't
		// convert it to any_extend_invec, due to the LegalOperations check, do the
		// conversion directly to a vector shuffle manually. This exposes combine
		// opportunities missed by combineExtInVec not calling
		// combineX86ShufflesRecursively on SSE4.1 targets.
		// FIXME: This is basically a hack around several other issues related to
		// ANY_EXTEND_VECTOR_INREG.
		if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
		(LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG \|\|
		LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
		LHS.getOperand(0).getValueType() == MVT::v4i32) {
		SDLoc dl(N);
		LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
		LHS.getOperand(0), { 0, -1, 1, -1 });
		LHS = DAG.getBitcast(MVT::v2i64, LHS);
		return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
		}
		if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
		(RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG \|\|
		RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
		RHS.getOperand(0).getValueType() == MVT::v4i32) {
		SDLoc dl(N);
		RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
		RHS.getOperand(0), { 0, -1, 1, -1 });
		RHS = DAG.getBitcast(MVT::v2i64, RHS);
		return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
		}

return SDValue();		return SDValue();
}		}

static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,		static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);		SDValue In = N->getOperand(0);
▲ Show 20 Lines • Show All 1,398 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/pmul.ll

	Show First 20 Lines • Show All 1,125 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: psrlq $32, %xmm1			; SSE2-NEXT: psrlq $32, %xmm1
	; SSE2-NEXT: pmuludq %xmm1, %xmm3			; SSE2-NEXT: pmuludq %xmm1, %xmm3
	; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]			; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
	; SSE2-NEXT: movaps %xmm3, %xmm0			; SSE2-NEXT: movaps %xmm3, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: mul_v4i64_zero_lower:			; SSE41-LABEL: mul_v4i64_zero_lower:
	; SSE41: # %bb.0: # %entry			; SSE41: # %bb.0: # %entry
	; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]			; SSE41-NEXT: psrlq $32, %xmm2
	; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero			; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
	; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero			; SSE41-NEXT: pmuludq %xmm2, %xmm3
	; SSE41-NEXT: psrlq $32, %xmm1			; SSE41-NEXT: psrlq $32, %xmm1
				; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
	; SSE41-NEXT: pmuludq %xmm1, %xmm0			; SSE41-NEXT: pmuludq %xmm1, %xmm0
	; SSE41-NEXT: psrlq $32, %xmm2			; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
	; SSE41-NEXT: pmuludq %xmm3, %xmm2
	; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: mul_v4i64_zero_lower:			; AVX-LABEL: mul_v4i64_zero_lower:
	; AVX: # %bb.0: # %entry			; AVX: # %bb.0: # %entry
	; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1			; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
	; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0			; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
	; AVX-NEXT: vpsllq $32, %ymm0, %ymm0			; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
	▲ Show 20 Lines • Show All 241 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/xop-ifma.ll

Show First 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	; XOP-AVX2-NEXT: retq
%1 = mul <8 x i32> %a0, %a1		%1 = mul <8 x i32> %a0, %a1
%2 = add <8 x i32> %a2, %1		%2 = add <8 x i32> %a2, %1
ret <8 x i32> %2		ret <8 x i32> %2
}		}

define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i64> %a2) {		define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i64> %a2) {
; XOP-AVX1-LABEL: test_mulx_v4i32_add_v4i64:		; XOP-AVX1-LABEL: test_mulx_v4i32_add_v4i64:
; XOP-AVX1: # %bb.0:		; XOP-AVX1: # %bb.0:
; XOP-AVX1-NEXT: vpmovsxdq %xmm0, %xmm3		; XOP-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]		; XOP-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; XOP-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0		; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; XOP-AVX1-NEXT: vpmovsxdq %xmm1, %xmm4		; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; XOP-AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
; XOP-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5		; XOP-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; XOP-AVX1-NEXT: vpmacsdql %xmm5, %xmm1, %xmm0, %xmm0		; XOP-AVX1-NEXT: vpmacsdql %xmm5, %xmm1, %xmm0, %xmm0
; XOP-AVX1-NEXT: vpmacsdql %xmm2, %xmm4, %xmm3, %xmm1		; XOP-AVX1-NEXT: vpmacsdql %xmm2, %xmm3, %xmm4, %xmm1
; XOP-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0		; XOP-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOP-AVX1-NEXT: retq		; XOP-AVX1-NEXT: retq
;		;
; XOP-AVX2-LABEL: test_mulx_v4i32_add_v4i64:		; XOP-AVX2-LABEL: test_mulx_v4i32_add_v4i64:
; XOP-AVX2: # %bb.0:		; XOP-AVX2: # %bb.0:
; XOP-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero		; XOP-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; XOP-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero		; XOP-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; XOP-AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0		; XOP-AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines