This is an archive of the discontinued LLVM Phabricator instance.

[X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first and second operands
ClosedPublic

Authored by mkuper on May 21 2015, 5:38 AM.

Download Raw Diff

Details

Reviewers

delena
craig.topper
lhames

Commits

rGf1452286764e: [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first…
rL238131: [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first…

Summary

The semantics of the scalar FMA intrinsics are that the high vector elements are copied from the first source, e.g. (from the Intel manual):

__m128 _mm_fmadd_ss (__m128 a, __m128 b, __m128 c)
Operation:
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

The current pattern switches src1 and src2 around (I guess to match the "213" order), which ends up tying the original src2 to the dest.
Since the actual scalar fma3 instructions copy the high elements from the dest register, the high elements of the result are taken from the original src2 instead of src1, i.e. the wrong values are copied.

This modifies the pattern to leave src1 and src2 in their original order.

Diff Detail

Event Timeline

mkuper updated this revision to Diff 26222. May 21 2015, 5:38 AM

mkuper retitled this revision from to [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first and second operands.

mkuper updated this object.

mkuper edited the test plan for this revision. (Show Details)

mkuper added reviewers: delena, lhames, craig.topper.

mkuper added a subscriber: Unknown Object (MLST).

LGTM

lib/Target/X86/X86InstrFMA.td
190	Please add a comment that you use 1-2-3 instead of 2-1-3 because src1 is tied to dest.

Will do.
Thanks, Elena!

Closed by commit rL238131: [X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first… (authored by mkuper). · Explain Why · May 25 2015, 5:39 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

	X86InstrFMA.td
	X86InstrFMA.td (revision 237799)

4 lines

test/

CodeGen/

X86/

	fma3-intrinsics.ll
	fma3-intrinsics.ll (revision 237799)

32 lines

Diff 26222

lib/Target/X86/X86InstrFMA.td

Show First 20 Lines • Show All 180 Lines • ▼ Show 20 Lines	multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,		defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;		FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,		defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;		FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;

def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),		def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
(COPY_TO_REGCLASS		(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SSr213r")		(!cast<Instruction>(NAME#"SSr213r")
(COPY_TO_REGCLASS $src2, FR32),
(COPY_TO_REGCLASS $src1, FR32),		(COPY_TO_REGCLASS $src1, FR32),
		(COPY_TO_REGCLASS $src2, FR32),
		delena (inline comment; unsubmitted, not done): Please add a comment that you use 1-2-3 instead of 2-1-3 because src1 is tied to dest.
(COPY_TO_REGCLASS $src3, FR32)),		(COPY_TO_REGCLASS $src3, FR32)),
VR128)>;		VR128)>;

def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),		def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
(COPY_TO_REGCLASS		(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SDr213r")		(!cast<Instruction>(NAME#"SDr213r")
(COPY_TO_REGCLASS $src2, FR64),
(COPY_TO_REGCLASS $src1, FR64),		(COPY_TO_REGCLASS $src1, FR64),
		(COPY_TO_REGCLASS $src2, FR64),
(COPY_TO_REGCLASS $src3, FR64)),		(COPY_TO_REGCLASS $src3, FR64)),
VR128)>;		VR128)>;
}		}

defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,		defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;		int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,		defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;		int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
▲ Show 20 Lines • Show All 184 Lines • Show Last 20 Lines

test/CodeGen/X86/fma3-intrinsics.ll

	; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s			; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
	; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s			; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
	; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s			; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s

	define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmadd213ss (%r8), %xmm			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmadd213ps			; CHECK: fmadd213ps
	%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {			define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
	; CHECK: fmadd213ps {{.\(%r.}}, %ymm			; CHECK: fmadd213ps {{.\(%r.}}, %ymm
	%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind			%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
	ret <8 x float> %res			ret <8 x float> %res
	}			}
	declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone			declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmadd213ss (%r8), %xmm			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmadd213ps			; CHECK: fnmadd213ps
	%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {			define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
	; CHECK: fnmadd213ps {{.\(%r.}}, %ymm			; CHECK: fnmadd213ps {{.\(%r.}}, %ymm
	%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind			%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
	ret <8 x float> %res			ret <8 x float> %res
	}			}
	declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone			declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone


	define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmsub213ss			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fmsub213ps			; CHECK: fmsub213ps
	%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmsub213ss			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]]
	%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
	; CHECK: fnmsub213ps			; CHECK: fnmsub213ps
	%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind			%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone			declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

	;;;;			;;;;

	define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmadd213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmadd213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmadd213pd			; CHECK: fmadd213pd
	%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmadd213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmadd213pd			; CHECK: fnmadd213pd
	%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone



	define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmsub213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fmsub213pd			; CHECK: fmsub213pd
	%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmsub213sd			; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
				; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
				; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]]
	%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

	define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	; CHECK: fnmsub213pd			; CHECK: fnmsub213pd
	%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind			%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
	ret <2 x double> %res			ret <2 x double> %res
	}			}
	declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone			declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone