This is an archive of the discontinued LLVM Phabricator instance.

[X86][FMA] Optimize FNEG(FMA) Patterns
ClosedPublic

Authored by RKSimon on Nov 22 2015, 11:11 AM.

Download Raw Diff

Details

Reviewers

spatel
qcolombet
delena

Commits

rG1b4fecb098a5: [X86][FMA] Optimize FNEG(FMA) Patterns
rL254016: [X86][FMA] Optimize FNEG(FMA) Patterns

Summary

X86 needs to use its own FMA opcodes, preventing the standard FNEG(FMA) pattern table recognition method used by other platforms. This patch adds support for lowering FNEG(FMA(X,Y,Z)) into a single suitably negated FMA instruction.

Fix for PR24364

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 40878. · Nov 22 2015, 11:11 AM

RKSimon retitled this revision to [X86][FMA] Optimize FNEG(FMA) Patterns.

RKSimon updated this object.

RKSimon added reviewers: spatel, delena, qcolombet.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

delena added inline comments. · Nov 23 2015, 2:25 AM

lib/Target/X86/X86ISelLowering.cpp
13400 ↗	(On Diff #40878)	What are we doing in PerformFMACombine()? Maybe we can extend it there?

RKSimon added inline comments. · Nov 23 2015, 6:15 AM

lib/Target/X86/X86ISelLowering.cpp
13400 ↗	(On Diff #40878)	In PerformFMACombine we convert ISD::FMA nodes to the equivalent X86ISD FMADD/FMSUB/FNMADD/FNMSUB nodes based on which inputs are FNEG. This patch deals with an FNEG which has an X86ISD FMA type as its input - we don't have a PerformFNEGCombine, but I could move this patch there if you think it necessary.

I agree that PerformFMACombine() is not the right place. Maybe PerformFNEGCombine() would be clearer? I'll let you decide.
LGTM.

This revision is now accepted and ready to land. · Nov 23 2015, 10:38 PM

Closed by commit rL254016: [X86][FMA] Optimize FNEG(FMA) Patterns (authored by RKSimon). · Explain Why · Nov 24 2015, 12:34 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

29 lines

test/

CodeGen/

X86/

fma_patterns.ll

68 lines

Diff 41074

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,767 Lines • ▼ Show 20 Lines
setTargetDAGCombine(ISD::SHL);		setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);		setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);		setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);		setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);		setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);		setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);		setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);		setTargetDAGCombine(ISD::FSUB);
		setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);		setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);		setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);		setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);		setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);		setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);		setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);		setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);		setTargetDAGCombine(ISD::ZERO_EXTEND);
▲ Show 20 Lines • Show All 24,359 Lines • ▼ Show 20 Lines	static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal subs from subs of shuffles.		// Try to synthesize horizontal subs from subs of shuffles.
if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 \|\| VT == MVT::v2f64)) \|\|		if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 \|\| VT == MVT::v2f64)) \|\|
(Subtarget->hasFp256() && (VT == MVT::v8f32 \|\| VT == MVT::v4f64))) &&		(Subtarget->hasFp256() && (VT == MVT::v8f32 \|\| VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, false))		isHorizontalBinOp(LHS, RHS, false))
return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);		return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
return SDValue();		return SDValue();
}		}

		/// Target-specific DAG combine for floating point negation nodes.
		///
		/// If the value being negated is an X86ISD FMADD/FMSUB/FNMADD/FNMSUB node
		/// with a single use, the negation is absorbed by swapping the opcode:
		///   FMADD  -> FNMSUB
		///   FMSUB  -> FNMADD
		///   FNMADD -> FMSUB
		///   FNMSUB -> FMADD
		/// The three operands of the original node are reused unchanged.
		///
		/// \returns the replacement node, or an empty SDValue when no fold applies.
		static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
		                                  const X86Subtarget *Subtarget) {
		  SDValue Src = N->getOperand(0);

		  // Only fold when the negation is the sole user of the operand; otherwise
		  // the original node would still be needed alongside the new one.
		  if (!Src.hasOneUse())
		    return SDValue();

		  // Pick the opcode that already includes the extra negation.
		  unsigned NegatedOpc;
		  switch (Src.getOpcode()) {
		  case X86ISD::FMADD:
		    NegatedOpc = X86ISD::FNMSUB;
		    break;
		  case X86ISD::FMSUB:
		    NegatedOpc = X86ISD::FNMADD;
		    break;
		  case X86ISD::FNMADD:
		    NegatedOpc = X86ISD::FMSUB;
		    break;
		  case X86ISD::FNMSUB:
		    NegatedOpc = X86ISD::FMADD;
		    break;
		  default:
		    // Not one of the FMA opcodes handled here; nothing to do.
		    return SDValue();
		  }

		  return DAG.getNode(NegatedOpc, SDLoc(N), N->getValueType(0),
		                     Src.getOperand(0), Src.getOperand(1),
		                     Src.getOperand(2));
		}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.		/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,		static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {		const X86Subtarget *Subtarget) {
assert(N->getOpcode() == X86ISD::FOR \|\| N->getOpcode() == X86ISD::FXOR);		assert(N->getOpcode() == X86ISD::FOR \|\| N->getOpcode() == X86ISD::FXOR);

// F[X]OR(0.0, x) -> x		// F[X]OR(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))		if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())		if (C->getValueAPF().isPosZero())
▲ Show 20 Lines • Show All 878 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);		case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);		case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);		case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);		case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);		case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);		case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);		case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);		case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
		case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);		case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
case X86ISD::FXOR:		case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);		case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
case X86ISD::FMIN:		case X86ISD::FMIN:
case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);		case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);		case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);		case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);		case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
▲ Show 20 Lines • Show All 852 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/fma_patterns.ll

	Show First 20 Lines • Show All 562 Lines • ▼ Show 20 Lines
	; CHECK_FMA4-NEXT: retq			; CHECK_FMA4-NEXT: retq
	%t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t			%t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
	%tx = fmul <4 x double> %x, %t			%tx = fmul <4 x double> %x, %t
	%ty = fmul <4 x double> %y, %t1			%ty = fmul <4 x double> %y, %t1
	%r = fadd <4 x double> %tx, %ty			%r = fadd <4 x double> %tx, %ty
	ret <4 x double> %r			ret <4 x double> %r
	}			}

				; (fneg (fma x, y, z)) -> (fma x, -y, -z)

				; Negation of (a0 * a1) + a2 on <4 x float>. The CHECK lines expect the
				; whole expression to be emitted as a single vfnmsub instruction
				; (vfnmsub213ps for CHECK_FMA, vfnmsubps for CHECK_FMA4).
				define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
				; CHECK_FMA-LABEL: test_v4f32_fneg_fmadd:
				; CHECK_FMA: # BB#0:
				; CHECK_FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
				; CHECK_FMA-NEXT: retq
				;
				; CHECK_FMA4-LABEL: test_v4f32_fneg_fmadd:
				; CHECK_FMA4: # BB#0:
				; CHECK_FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
				; CHECK_FMA4-NEXT: retq
				%mul = fmul <4 x float> %a0, %a1
				%add = fadd <4 x float> %mul, %a2
				; Subtracting from a -0.0 splat is the negation under test.
				%neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
				ret <4 x float> %neg
				}

				; Negation of (a0 * a1) - a2 on <4 x double>. The CHECK lines expect the
				; whole expression to be emitted as a single vfnmadd instruction
				; (vfnmadd213pd for CHECK_FMA, vfnmaddpd for CHECK_FMA4).
				define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
				; CHECK_FMA-LABEL: test_v4f64_fneg_fmsub:
				; CHECK_FMA: # BB#0:
				; CHECK_FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
				; CHECK_FMA-NEXT: retq
				;
				; CHECK_FMA4-LABEL: test_v4f64_fneg_fmsub:
				; CHECK_FMA4: # BB#0:
				; CHECK_FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
				; CHECK_FMA4-NEXT: retq
				%mul = fmul <4 x double> %a0, %a1
				%sub = fsub <4 x double> %mul, %a2
				; Subtracting from a -0.0 splat is the negation under test.
				%neg = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
				ret <4 x double> %neg
				}

				; Negation of (-(a0 * a1)) + a2 on <4 x float>. The CHECK lines expect the
				; whole expression to be emitted as a single vfmsub instruction
				; (vfmsub213ps for CHECK_FMA, vfmsubps for CHECK_FMA4).
				define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
				; CHECK_FMA-LABEL: test_v4f32_fneg_fnmadd:
				; CHECK_FMA: # BB#0:
				; CHECK_FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
				; CHECK_FMA-NEXT: retq
				;
				; CHECK_FMA4-LABEL: test_v4f32_fneg_fnmadd:
				; CHECK_FMA4: # BB#0:
				; CHECK_FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
				; CHECK_FMA4-NEXT: retq
				%mul = fmul <4 x float> %a0, %a1
				; Inner negation: applied to the product before the add.
				%neg0 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
				%add = fadd <4 x float> %neg0, %a2
				; Outer negation: applied to the full result; this is the one under test.
				%neg1 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
				ret <4 x float> %neg1
				}

				; Negation of (-(a0 * a1)) - a2 on <4 x double>. The CHECK lines expect the
				; whole expression to be emitted as a single vfmadd instruction
				; (vfmadd213pd for CHECK_FMA, vfmaddpd for CHECK_FMA4).
				define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
				; CHECK_FMA-LABEL: test_v4f64_fneg_fnmsub:
				; CHECK_FMA: # BB#0:
				; CHECK_FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
				; CHECK_FMA-NEXT: retq
				;
				; CHECK_FMA4-LABEL: test_v4f64_fneg_fnmsub:
				; CHECK_FMA4: # BB#0:
				; CHECK_FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
				; CHECK_FMA4-NEXT: retq
				%mul = fmul <4 x double> %a0, %a1
				; Inner negation: applied to the product before the subtract.
				%neg0 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
				%sub = fsub <4 x double> %neg0, %a2
				; Outer negation: applied to the full result; this is the one under test.
				%neg1 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
				ret <4 x double> %neg1
				}

	; (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)			; (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)

	define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {			define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
	; ALL-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:			; ALL-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0			; ALL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>			%m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
	Show All 24 Lines