Index: llvm/trunk/lib/Target/X86/X86InstrFMA.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFMA.td
+++ llvm/trunk/lib/Target/X86/X86InstrFMA.td
@@ -183,19 +183,24 @@
   defm SD : fma3s_forms, VEX_W;
 
+  // These patterns use the 123 ordering, instead of 213, even though
+  // they match the intrinsic to the 213 version of the instruction.
+  // This is because src1 is tied to dest, and the scalar intrinsics
+  // require the pass-through values to come from the first source
+  // operand, not the second.
   def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
             (COPY_TO_REGCLASS
               (!cast<Instruction>(NAME#"SSr213r")
-                (COPY_TO_REGCLASS $src2, FR32),
                 (COPY_TO_REGCLASS $src1, FR32),
+                (COPY_TO_REGCLASS $src2, FR32),
                 (COPY_TO_REGCLASS $src3, FR32)),
               VR128)>;
 
   def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
             (COPY_TO_REGCLASS
               (!cast<Instruction>(NAME#"SDr213r")
-                (COPY_TO_REGCLASS $src2, FR64),
                 (COPY_TO_REGCLASS $src1, FR64),
+                (COPY_TO_REGCLASS $src2, FR64),
                 (COPY_TO_REGCLASS $src3, FR64)),
               VR128)>;
 }
Index: llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll
@@ -3,7 +3,9 @@
 ; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
 
 define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fmadd213ss (%r8), %xmm
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -24,7 +26,9 @@
 declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
 
 define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fnmadd213ss (%r8), %xmm
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -46,7 +50,9 @@
 
 
 define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fmsub213ss
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -60,7 +66,9 @@
 declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
 
 define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fnmsub213ss
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -76,7 +84,9 @@
 ;;;;
 
 define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fmadd213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmadd213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
@@ -90,7 +100,9 @@
 declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
 
 define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fnmadd213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
@@ -106,7 +118,9 @@
 
 
 define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fmsub213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
@@ -120,7 +134,9 @@
 declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
 
 define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fnmsub213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }