This is an archive of the discontinued LLVM Phabricator instance.

[X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first and second operands
ClosedPublic

Authored by mkuper on May 21 2015, 5:38 AM.

Download Raw Diff

Details

Reviewers

delena
craig.topper
lhames

Commits

rGf1452286764e: [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first…
rL238131: [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first…

Summary

The semantics of the scalar FMA intrinsics are that the high vector elements are copied from the first source, e.g. (from the Intel manual):

__m128 _mm_fmadd_ss (__m128 a, __m128 b, __m128 c)
Operation:
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

The current pattern switches src1 and src2 around (I guess to match the "213" order), which ends up tying the original src2 to the dest.
Since the actual scalar fma3 instructions copy the high elements from the dest register, the wrong values are copied.

This modifies the pattern to leave src1 and src2 in their original order.

Diff Detail

Repository: rL LLVM

Event Timeline

mkuper updated this revision to Diff 26222. May 21 2015, 5:38 AM

mkuper retitled this revision from to [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first and second operands.

mkuper updated this object.

mkuper edited the test plan for this revision. (Show Details)

mkuper added reviewers: delena, lhames, craig.topper.

mkuper added a subscriber: Unknown Object (MLST).

LGTM

lib/Target/X86/X86InstrFMA.td
190 ↗	(On Diff #26222)	Please add a comment explaining that you use the 1-2-3 order instead of 2-1-3 because src1 is tied to dest.

Will do.
Thanks, Elena!

Closed by commit rL238131: [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first… (authored by mkuper). · Explain Why · May 25 2015, 5:39 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86InstrFMA.td

9 lines

test/

CodeGen/

X86/

fma3-intrinsics.ll

32 lines

Diff 26424

llvm/trunk/lib/Target/X86/X86InstrFMA.td

	Show First 20 Lines • Show All 177 Lines • ▼ Show 20 Lines
	multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,			multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
	string OpStr, Intrinsic IntF32, Intrinsic IntF64,			string OpStr, Intrinsic IntF32, Intrinsic IntF64,
	SDNode OpNode> {			SDNode OpNode> {
	defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,			defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
	FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;			FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
	defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,			defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
	FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;			FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;

				// These patterns use the 123 ordering, instead of 213, even though
				// they match the intrinsic to the 213 version of the instruction.
				// This is because src1 is tied to dest, and the scalar intrinsics
				// require the pass-through values to come from the first source
				// operand, not the second.
	def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),			def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
	(COPY_TO_REGCLASS			(COPY_TO_REGCLASS
	(!cast<Instruction>(NAME#"SSr213r")			(!cast<Instruction>(NAME#"SSr213r")
	(COPY_TO_REGCLASS $src2, FR32),
	(COPY_TO_REGCLASS $src1, FR32),			(COPY_TO_REGCLASS $src1, FR32),
				(COPY_TO_REGCLASS $src2, FR32),
	(COPY_TO_REGCLASS $src3, FR32)),			(COPY_TO_REGCLASS $src3, FR32)),
	VR128)>;			VR128)>;

	def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),			def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
	(COPY_TO_REGCLASS			(COPY_TO_REGCLASS
	(!cast<Instruction>(NAME#"SDr213r")			(!cast<Instruction>(NAME#"SDr213r")
	(COPY_TO_REGCLASS $src2, FR64),
	(COPY_TO_REGCLASS $src1, FR64),			(COPY_TO_REGCLASS $src1, FR64),
				(COPY_TO_REGCLASS $src2, FR64),
	(COPY_TO_REGCLASS $src3, FR64)),			(COPY_TO_REGCLASS $src3, FR64)),
	VR128)>;			VR128)>;
	}			}

	defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,			defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
	int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;			int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
	defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,			defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
	int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;			int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
	▲ Show 20 Lines • Show All 184 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll

	; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s			; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
	; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s			; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
	; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s			; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s

	define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmadd213ss (%r8), %xmm			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmadd213ps			; CHECK: fmadd213ps
	%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {			define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
	; CHECK: fmadd213ps {{.\(%r.}}, %ymm			; CHECK: fmadd213ps {{.\(%r.}}, %ymm
	%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind			%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
	ret <8 x float> %res			ret <8 x float> %res
	}			}
	declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone			declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmadd213ss (%r8), %xmm			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmadd213ps			; CHECK: fnmadd213ps
	%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {			define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
	; CHECK: fnmadd213ps {{.\(%r.}}, %ymm			; CHECK: fnmadd213ps {{.\(%r.}}, %ymm
	%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind			%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
	ret <8 x float> %res			ret <8 x float> %res
	}			}
	declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone			declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone


	define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmsub213ss			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmsub213ps			; CHECK: fmsub213ps
	%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmsub213ss			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmsub213ps			; CHECK: fnmsub213ps
	%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	;;;;			;;;;

	define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmadd213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmadd213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmadd213pd			; CHECK: fmadd213pd
	%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmadd213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmadd213pd			; CHECK: fnmadd213pd
	%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone



	define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmsub213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmsub213pd			; CHECK: fmsub213pd
	%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmsub213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmsub213pd			; CHECK: fnmsub213pd
	%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone