Skip to content

Commit f145228

Browse files
author
Michael Kuperstein
committed May 25, 2015
[X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first and second operands.
The semantics of the scalar FMA intrinsics are that the high vector elements are copied from the first source. The existing pattern swaps src1 and src2 to match the "213" order, which ends up tying the original src2 to the dest. Since the actual scalar FMA3 instructions copy the high elements from the dest register, the wrong values are copied. This change modifies the pattern to leave src1 and src2 in their original order.

Differential Revision: http://reviews.llvm.org/D9908

llvm-svn: 238131
1 parent 1c1391b commit f145228

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed
 

‎llvm/lib/Target/X86/X86InstrFMA.td

+7-2
Original file line numberDiff line numberDiff line change
@@ -183,19 +183,24 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
183183
defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
184184
FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
185185

186+
// These patterns use the 123 ordering, instead of 213, even though
187+
// they match the intrinsic to the 213 version of the instruction.
188+
// This is because src1 is tied to dest, and the scalar intrinsics
189+
// require the pass-through values to come from the first source
190+
// operand, not the second.
186191
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
187192
(COPY_TO_REGCLASS
188193
(!cast<Instruction>(NAME#"SSr213r")
189-
(COPY_TO_REGCLASS $src2, FR32),
190194
(COPY_TO_REGCLASS $src1, FR32),
195+
(COPY_TO_REGCLASS $src2, FR32),
191196
(COPY_TO_REGCLASS $src3, FR32)),
192197
VR128)>;
193198

194199
def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
195200
(COPY_TO_REGCLASS
196201
(!cast<Instruction>(NAME#"SDr213r")
197-
(COPY_TO_REGCLASS $src2, FR64),
198202
(COPY_TO_REGCLASS $src1, FR64),
203+
(COPY_TO_REGCLASS $src2, FR64),
199204
(COPY_TO_REGCLASS $src3, FR64)),
200205
VR128)>;
201206
}

‎llvm/test/CodeGen/X86/fma3-intrinsics.ll

+24-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
44

55
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
6-
; CHECK: fmadd213ss (%r8), %xmm
6+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
7+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
8+
; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]]
79
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
810
ret <4 x float> %res
911
}
@@ -24,7 +26,9 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
2426
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
2527

2628
define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
27-
; CHECK: fnmadd213ss (%r8), %xmm
29+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
30+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
31+
; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]]
2832
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
2933
ret <4 x float> %res
3034
}
@@ -46,7 +50,9 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x
4650

4751

4852
define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
49-
; CHECK: fmsub213ss
53+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
54+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
55+
; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]]
5056
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
5157
ret <4 x float> %res
5258
}
@@ -60,7 +66,9 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
6066
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
6167

6268
define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
63-
; CHECK: fnmsub213ss
69+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
70+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
71+
; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]]
6472
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
6573
ret <4 x float> %res
6674
}
@@ -76,7 +84,9 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
7684
;;;;
7785

7886
define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
79-
; CHECK: fmadd213sd
87+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
88+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
89+
; CHECK: fmadd213sd (%r8), [[XMM1]], [[XMM0]]
8090
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
8191
ret <2 x double> %res
8292
}
@@ -90,7 +100,9 @@ define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x
90100
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
91101

92102
define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
93-
; CHECK: fnmadd213sd
103+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
104+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
105+
; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]]
94106
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
95107
ret <2 x double> %res
96108
}
@@ -106,7 +118,9 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d
106118

107119

108120
define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
109-
; CHECK: fmsub213sd
121+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
122+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
123+
; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]]
110124
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
111125
ret <2 x double> %res
112126
}
@@ -120,7 +134,9 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
120134
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
121135

122136
define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
123-
; CHECK: fnmsub213sd
137+
; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
138+
; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
139+
; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]]
124140
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
125141
ret <2 x double> %res
126142
}

0 commit comments

Comments
 (0)
Please sign in to comment.