diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -2287,16 +2287,6 @@ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fneg (fma x, y, z)) -> (vfnma z, x, y) -def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), - (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, - Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), - (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, - Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), - (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, - Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, @@ -2355,16 +2345,6 @@ def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; -// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) -def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), - (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, - Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), - (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, - Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), - (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, - Requires<[HasFullFP16]>; //===----------------------------------------------------------------------===// // FP Conditional moves. diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir @@ -539,8 +539,9 @@ ; CHECK: [[COPY:%[0-9]+]]:spr = COPY $s0 ; CHECK: [[COPY1:%[0-9]+]]:spr = COPY $s1 ; CHECK: [[COPY2:%[0-9]+]]:spr = COPY $s2 - ; CHECK: [[VFNMAS:%[0-9]+]]:spr = VFNMAS [[COPY2]], [[COPY]], [[COPY1]], 14 /* CC::al */, $noreg - ; CHECK: $s0 = COPY [[VFNMAS]] + ; CHECK: [[VFMAS:%[0-9]+]]:spr = VFMAS [[COPY2]], [[COPY]], [[COPY1]], 14 /* CC::al */, $noreg + ; CHECK: [[VNEGS:%[0-9]+]]:spr = VNEGS [[VFMAS]], 14 /* CC::al */, $noreg + ; CHECK: $s0 = COPY [[VNEGS]] ; CHECK: BX_RET 14 /* CC::al */, $noreg, implicit $s0 %0(s32) = COPY $s0 %1(s32) = COPY $s1 diff --git a/llvm/test/CodeGen/ARM/GlobalISel/select-pr35926.mir b/llvm/test/CodeGen/ARM/GlobalISel/select-pr35926.mir --- a/llvm/test/CodeGen/ARM/GlobalISel/select-pr35926.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/select-pr35926.mir @@ -31,8 +31,9 @@ ; CHECK: [[COPY:%[0-9]+]]:dpr = COPY $d0 ; CHECK: [[COPY1:%[0-9]+]]:dpr = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:dpr = COPY $d2 - ; CHECK: [[VFNMSD:%[0-9]+]]:dpr = VFNMSD [[COPY2]], [[COPY1]], [[COPY]], 14 /* CC::al */, $noreg - ; CHECK: $d0 = COPY [[VFNMSD]] + ; CHECK: [[VFMSD:%[0-9]+]]:dpr = VFMSD [[COPY2]], [[COPY1]], [[COPY]], 14 /* CC::al */, $noreg + ; CHECK: [[VNEGD:%[0-9]+]]:dpr = VNEGD [[VFMSD]], 14 /* CC::al */, $noreg + ; CHECK: $d0 = COPY [[VNEGD]] ; CHECK: MOVPCLR 14 /* CC::al */, $noreg, implicit $d0 %0:fprb(s64) = COPY $d0 %1:fprb(s64) = COPY $d1 diff --git a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll --- a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll +++ b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll @@ -288,8 +288,9 @@ ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vfnma.f16 s4, s2, s0 -; CHECK-NEXT: vstr.16 s4, [r0] +; CHECK-NEXT: vfma.f16 s4, s2, s0 +; CHECK-NEXT: vneg.f16 s0, s4 +; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr ; ; DONT-FUSE-LABEL: fnma1: @@ -297,8 +298,9 @@ ; DONT-FUSE-NEXT: vldr.16 s0, [r1] ; DONT-FUSE-NEXT: vldr.16 s2, [r0] ; DONT-FUSE-NEXT: vldr.16 s4, [r2] -; DONT-FUSE-NEXT: vfnma.f16 s4, s2, s0 -; DONT-FUSE-NEXT: vstr.16 s4, [r0] +; DONT-FUSE-NEXT: vfma.f16 s4, s2, s0 +; DONT-FUSE-NEXT: vneg.f16 s0, s4 +; DONT-FUSE-NEXT: vstr.16 s0, [r0] ; DONT-FUSE-NEXT: bx lr %f1 = load half, half *%a1, align 2 @@ -373,8 +375,9 @@ ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vfnms.f16 s4, s2, s0 -; CHECK-NEXT: vstr.16 s4, [r0] +; CHECK-NEXT: vfms.f16 s4, s2, s0 +; CHECK-NEXT: vneg.f16 s0, s4 +; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr ; ; DONT-FUSE-LABEL: fnms2: @@ -382,8 +385,9 @@ ; DONT-FUSE-NEXT: vldr.16 s0, [r1] ; DONT-FUSE-NEXT: vldr.16 s2, [r0] ; DONT-FUSE-NEXT: vldr.16 s4, [r2] -; DONT-FUSE-NEXT: vfnms.f16 s4, s2, s0 -; DONT-FUSE-NEXT: vstr.16 s4, [r0] +; DONT-FUSE-NEXT: vfms.f16 s4, s2, s0 +; DONT-FUSE-NEXT: vneg.f16 s0, s4 +; DONT-FUSE-NEXT: vstr.16 s0, [r0] ; DONT-FUSE-NEXT: bx lr %f1 = load half, half *%a1, align 2 @@ -402,8 +406,9 @@ ; CHECK-NEXT: vldr.16 s0, [r0] ; CHECK-NEXT: vldr.16 s2, [r1] ; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vfnms.f16 s4, s2, s0 -; CHECK-NEXT: vstr.16 s4, [r0] +; CHECK-NEXT: vfms.f16 s4, s2, s0 +; CHECK-NEXT: vneg.f16 s0, s4 +; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr ; ; DONT-FUSE-LABEL: fnms3: @@ -411,8 +416,9 @@ ; DONT-FUSE-NEXT: vldr.16 s0, [r0] ; DONT-FUSE-NEXT: vldr.16 s2, [r1] ; DONT-FUSE-NEXT: vldr.16 s4, [r2] -; DONT-FUSE-NEXT: vfnms.f16 s4, s2, s0 -; DONT-FUSE-NEXT: vstr.16 s4, [r0] +; DONT-FUSE-NEXT: vfms.f16 s4, s2, s0 +; DONT-FUSE-NEXT: vneg.f16 s0, s4 +; DONT-FUSE-NEXT: vstr.16 s0, [r0] ; DONT-FUSE-NEXT: bx lr %f1 = load half, half *%a1, align 2 diff --git a/llvm/test/CodeGen/ARM/fusedMAC.ll b/llvm/test/CodeGen/ARM/fusedMAC.ll --- a/llvm/test/CodeGen/ARM/fusedMAC.ll +++ b/llvm/test/CodeGen/ARM/fusedMAC.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fp-contract=fast | FileCheck %s ; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m7 -fp-contract=fast | FileCheck %s ; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m4 -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE @@ -6,20 +7,46 @@ ; Check generated fused MAC and MLS. define arm_aapcs_vfpcc double @fusedMACTest1(double %d1, double %d2, double %d3) { -;CHECK-LABEL: fusedMACTest1: -;CHECK: vfma.f64 +; CHECK-LABEL: fusedMACTest1: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfma.f64 d2, d0, d1 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest1: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .vsave {d8} +; DONT-FUSE-NEXT: vpush {d8} +; DONT-FUSE-NEXT: vmov.f32 s16, s4 +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: vmov.f32 s17, s5 +; DONT-FUSE-NEXT: bl __aeabi_dmul +; DONT-FUSE-NEXT: vmov r2, r3, d8 +; DONT-FUSE-NEXT: bl __aeabi_dadd +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vpop {d8} +; DONT-FUSE-NEXT: pop {r7, pc} %1 = fmul double %d1, %d2 %2 = fadd double %1, %d3 ret double %2 } define arm_aapcs_vfpcc float @fusedMACTest2(float %f1, float %f2, float %f3) { -;CHECK-LABEL: fusedMACTest2: -;CHECK: vfma.f32 +; CHECK-LABEL: fusedMACTest2: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfma.f32 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest2: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s0, s0, s1 +; DONT-FUSE-NEXT: vadd.f32 s0, s0, s2 +; DONT-FUSE-NEXT: bx lr -;DONT-FUSE-LABEL: fusedMACTest2: -;DONT-FUSE: vmul.f32 -;DONT-FUSE-NEXT: vadd.f32 %1 = fmul float %f1, %f2 %2 = fadd float %1, %f3 @@ -27,24 +54,82 @@ } define arm_aapcs_vfpcc double @fusedMACTest3(double %d1, double %d2, double %d3) { -;CHECK-LABEL: fusedMACTest3: -;CHECK: vfms.f64 +; CHECK-LABEL: fusedMACTest3: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfms.f64 d0, d1, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest3: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .vsave {d8} +; DONT-FUSE-NEXT: vpush {d8} +; DONT-FUSE-NEXT: vmov.f32 s16, s0 +; DONT-FUSE-NEXT: vmov r0, r1, d1 +; DONT-FUSE-NEXT: vmov r2, r3, d2 +; DONT-FUSE-NEXT: vmov.f32 s17, s1 +; DONT-FUSE-NEXT: bl __aeabi_dmul +; DONT-FUSE-NEXT: mov r2, r0 +; DONT-FUSE-NEXT: mov r3, r1 +; DONT-FUSE-NEXT: vmov r0, r1, d8 +; DONT-FUSE-NEXT: bl __aeabi_dsub +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vpop {d8} +; DONT-FUSE-NEXT: pop {r7, pc} %1 = fmul double %d2, %d3 %2 = fsub double %d1, %1 ret double %2 } define arm_aapcs_vfpcc float @fusedMACTest4(float %f1, float %f2, float %f3) { -;CHECK-LABEL: fusedMACTest4: -;CHECK: vfms.f32 +; CHECK-LABEL: fusedMACTest4: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfms.f32 s0, s1, s2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest4: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s2, s1, s2 +; DONT-FUSE-NEXT: vsub.f32 s0, s0, s2 +; DONT-FUSE-NEXT: bx lr %1 = fmul float %f2, %f3 %2 = fsub float %f1, %1 ret float %2 } define arm_aapcs_vfpcc double @fusedMACTest5(double %d1, double %d2, double %d3) { -;CHECK-LABEL: fusedMACTest5: -;CHECK: vfnma.f64 +; CHECK-LABEL: fusedMACTest5: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfnma.f64 d2, d0, d1 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest5: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .vsave {d8} +; DONT-FUSE-NEXT: vpush {d8} +; DONT-FUSE-NEXT: .pad #8 +; DONT-FUSE-NEXT: sub sp, #8 +; DONT-FUSE-NEXT: vmov.f32 s16, s4 +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: vmov.f32 s17, s5 +; DONT-FUSE-NEXT: bl __aeabi_dmul +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vmov r2, r3, d8 +; DONT-FUSE-NEXT: vstr d0, [sp] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #7] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #7] +; DONT-FUSE-NEXT: ldrd r0, r1, [sp] +; DONT-FUSE-NEXT: bl __aeabi_dsub +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: add sp, #8 +; DONT-FUSE-NEXT: vpop {d8} +; DONT-FUSE-NEXT: pop {r7, pc} %1 = fmul double %d1, %d2 %2 = fsub double -0.0, %1 %3 = fsub double %2, %d3 @@ -52,8 +137,17 @@ } define arm_aapcs_vfpcc float @fusedMACTest6(float %f1, float %f2, float %f3) { -;CHECK-LABEL: fusedMACTest6: -;CHECK: vfnma.f32 +; CHECK-LABEL: fusedMACTest6: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfnma.f32 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest6: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vnmul.f32 s0, s0, s1 +; DONT-FUSE-NEXT: vsub.f32 s0, s0, s2 +; DONT-FUSE-NEXT: bx lr %1 = fmul float %f1, %f2 %2 = fsub float -0.0, %1 %3 = fsub float %2, %f3 @@ -61,108 +155,275 @@ } define arm_aapcs_vfpcc double @fusedMACTest7(double %d1, double %d2, double %d3) { -;CHECK-LABEL: fusedMACTest7: -;CHECK: vfnms.f64 +; CHECK-LABEL: fusedMACTest7: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfnms.f64 d2, d0, d1 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest7: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .vsave {d8} +; DONT-FUSE-NEXT: vpush {d8} +; DONT-FUSE-NEXT: vmov.f32 s16, s4 +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: vmov.f32 s17, s5 +; DONT-FUSE-NEXT: bl __aeabi_dmul +; DONT-FUSE-NEXT: vmov r2, r3, d8 +; DONT-FUSE-NEXT: bl __aeabi_dsub +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vpop {d8} +; DONT-FUSE-NEXT: pop {r7, pc} %1 = fmul double %d1, %d2 %2 = fsub double %1, %d3 ret double %2 } define arm_aapcs_vfpcc float @fusedMACTest8(float %f1, float %f2, float %f3) { -;CHECK-LABEL: fusedMACTest8: -;CHECK: vfnms.f32 +; CHECK-LABEL: fusedMACTest8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vfnms.f32 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: fusedMACTest8: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s0, s0, s1 +; DONT-FUSE-NEXT: vsub.f32 s0, s0, s2 +; DONT-FUSE-NEXT: bx lr %1 = fmul float %f1, %f2 %2 = fsub float %1, %f3 ret float %2 } define arm_aapcs_vfpcc <2 x float> @fusedMACTest9(<2 x float> %a, <2 x float> %b) { -;CHECK-LABEL: fusedMACTest9: -;CHECK: vfma.f32 +; DONT-FUSE-LABEL: fusedMACTest9: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s4, s1, s3 +; DONT-FUSE-NEXT: vmul.f32 s2, s0, s2 +; DONT-FUSE-NEXT: vadd.f32 s0, s2, s0 +; DONT-FUSE-NEXT: vadd.f32 s1, s4, s1 +; DONT-FUSE-NEXT: bx lr %mul = fmul <2 x float> %a, %b %add = fadd <2 x float> %mul, %a ret <2 x float> %add } define arm_aapcs_vfpcc <2 x float> @fusedMACTest10(<2 x float> %a, <2 x float> %b) { -;CHECK-LABEL: fusedMACTest10: -;CHECK: vfms.f32 +; DONT-FUSE-LABEL: fusedMACTest10: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s4, s1, s3 +; DONT-FUSE-NEXT: vmul.f32 s2, s0, s2 +; DONT-FUSE-NEXT: vsub.f32 s0, s0, s2 +; DONT-FUSE-NEXT: vsub.f32 s1, s1, s4 +; DONT-FUSE-NEXT: bx lr %mul = fmul <2 x float> %a, %b %sub = fsub <2 x float> %a, %mul ret <2 x float> %sub } define arm_aapcs_vfpcc <4 x float> @fusedMACTest11(<4 x float> %a, <4 x float> %b) { -;CHECK-LABEL: fusedMACTest11: -;CHECK: vfma.f32 +; DONT-FUSE-LABEL: fusedMACTest11: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s8, s3, s7 +; DONT-FUSE-NEXT: vmul.f32 s6, s2, s6 +; DONT-FUSE-NEXT: vmul.f32 s10, s1, s5 +; DONT-FUSE-NEXT: vmul.f32 s4, s0, s4 +; DONT-FUSE-NEXT: vadd.f32 s0, s4, s0 +; DONT-FUSE-NEXT: vadd.f32 s1, s10, s1 +; DONT-FUSE-NEXT: vadd.f32 s2, s6, s2 +; DONT-FUSE-NEXT: vadd.f32 s3, s8, s3 +; DONT-FUSE-NEXT: bx lr %mul = fmul <4 x float> %a, %b %add = fadd <4 x float> %mul, %a ret <4 x float> %add } define arm_aapcs_vfpcc <4 x float> @fusedMACTest12(<4 x float> %a, <4 x float> %b) { -;CHECK-LABEL: fusedMACTest12: -;CHECK: vfms.f32 +; DONT-FUSE-LABEL: fusedMACTest12: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmul.f32 s8, s3, s7 +; DONT-FUSE-NEXT: vmul.f32 s6, s2, s6 +; DONT-FUSE-NEXT: vmul.f32 s10, s1, s5 +; DONT-FUSE-NEXT: vmul.f32 s4, s0, s4 +; DONT-FUSE-NEXT: vsub.f32 s0, s0, s4 +; DONT-FUSE-NEXT: vsub.f32 s1, s1, s10 +; DONT-FUSE-NEXT: vsub.f32 s2, s2, s6 +; DONT-FUSE-NEXT: vsub.f32 s3, s3, s8 +; DONT-FUSE-NEXT: bx lr %mul = fmul <4 x float> %a, %b %sub = fsub <4 x float> %a, %mul ret <4 x float> %sub } define arm_aapcs_vfpcc float @test_fma_f32(float %a, float %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: test_fma_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfma.f32 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fma_f32: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: vfma.f32 s2, s0, s1 +; DONT-FUSE-NEXT: vmov.f32 s0, s2 +; DONT-FUSE-NEXT: bx lr entry: -; CHECK: test_fma_f32 -; CHECK: vfma.f32 %tmp1 = tail call float @llvm.fma.f32(float %a, float %b, float %c) nounwind readnone ret float %tmp1 } define arm_aapcs_vfpcc double @test_fma_f64(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fma_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfma.f64 d2, d0, d1 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fma_f64: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #8 +; DONT-FUSE-NEXT: sub sp, #8 +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: vstr d2, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: add sp, #8 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fma_f64 -; CHECK: vfma.f64 %tmp1 = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone ret double %tmp1 } define arm_aapcs_vfpcc <2 x float> @test_fma_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp { +; DONT-FUSE-LABEL: test_fma_v2f32: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: vfma.f32 s4, s0, s2 +; DONT-FUSE-NEXT: vfma.f32 s5, s1, s3 +; DONT-FUSE-NEXT: vmov.f32 s0, s4 +; DONT-FUSE-NEXT: vmov.f32 s1, s5 +; DONT-FUSE-NEXT: bx lr entry: -; CHECK: test_fma_v2f32 -; CHECK: vfma.f32 %tmp1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind ret <2 x float> %tmp1 } define arm_aapcs_vfpcc double @test_fms_f64(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fms_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfms.f64 d2, d0, d1 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fms_f64: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #16 +; DONT-FUSE-NEXT: sub sp, #16 +; DONT-FUSE-NEXT: vstr d0, [sp, #8] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #15] +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #15] +; DONT-FUSE-NEXT: ldrd r0, r1, [sp, #8] +; DONT-FUSE-NEXT: vstr d2, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: add sp, #16 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fms_f64 -; CHECK: vfms.f64 %tmp1 = fsub double -0.0, %a %tmp2 = tail call double @llvm.fma.f64(double %tmp1, double %b, double %c) nounwind readnone ret double %tmp2 } define arm_aapcs_vfpcc double @test_fms_f64_2(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fms_f64_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfms.f64 d2, d1, d0 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fms_f64_2: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #16 +; DONT-FUSE-NEXT: sub sp, #16 +; DONT-FUSE-NEXT: vstr d1, [sp, #8] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #15] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #15] +; DONT-FUSE-NEXT: ldrd r2, r3, [sp, #8] +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vstr d2, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: add sp, #16 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fms_f64_2 -; CHECK: vfms.f64 %tmp1 = fsub double -0.0, %b %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone ret double %tmp2 } define arm_aapcs_vfpcc float @test_fnms_f32(float %a, float %b, float* %c) nounwind readnone ssp { -; CHECK: test_fnms_f32 -; CHECK: vfnms.f32 +; CHECK-LABEL: test_fnms_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, [r0] +; CHECK-NEXT: vfnms.f32 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fnms_f32: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vldr s2, [r0] +; DONT-FUSE-NEXT: vfnms.f32 s2, s0, s1 +; DONT-FUSE-NEXT: vmov.f32 s0, s2 +; DONT-FUSE-NEXT: bx lr %tmp1 = load float, float* %c, align 4 %tmp2 = fsub float -0.0, %tmp1 %tmp3 = tail call float @llvm.fma.f32(float %a, float %b, float %tmp2) nounwind readnone - ret float %tmp3 + ret float %tmp3 } define arm_aapcs_vfpcc double @test_fnms_f64(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fnms_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfms.f64 d2, d0, d1 +; CHECK-NEXT: vneg.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fnms_f64: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #24 +; DONT-FUSE-NEXT: sub sp, #24 +; DONT-FUSE-NEXT: vstr d0, [sp, #8] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #15] +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #15] +; DONT-FUSE-NEXT: ldrd r0, r1, [sp, #8] +; DONT-FUSE-NEXT: vstr d2, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vstr d0, [sp, #16] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #23] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #23] +; DONT-FUSE-NEXT: vldr d0, [sp, #16] +; DONT-FUSE-NEXT: add sp, #24 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fnms_f64 -; CHECK: vfnms.f64 %tmp1 = fsub double -0.0, %a %tmp2 = tail call double @llvm.fma.f64(double %tmp1, double %b, double %c) nounwind readnone %tmp3 = fsub double -0.0, %tmp2 @@ -170,9 +431,35 @@ } define arm_aapcs_vfpcc double @test_fnms_f64_2(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fnms_f64_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfms.f64 d2, d1, d0 +; CHECK-NEXT: vneg.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fnms_f64_2: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #24 +; DONT-FUSE-NEXT: sub sp, #24 +; DONT-FUSE-NEXT: vstr d1, [sp, #8] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #15] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #15] +; DONT-FUSE-NEXT: ldrd r2, r3, [sp, #8] +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vstr d2, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vstr d0, [sp, #16] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #23] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #23] +; DONT-FUSE-NEXT: vldr d0, [sp, #16] +; DONT-FUSE-NEXT: add sp, #24 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fnms_f64_2 -; CHECK: vfnms.f64 %tmp1 = fsub double -0.0, %b %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone %tmp3 = fsub double -0.0, %tmp2 @@ -180,18 +467,66 @@ } define arm_aapcs_vfpcc double @test_fnma_f64(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fnma_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfma.f64 d2, d0, d1 +; CHECK-NEXT: vneg.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fnma_f64: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #16 +; DONT-FUSE-NEXT: sub sp, #16 +; DONT-FUSE-NEXT: vmov r0, r1, d0 +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: vstr d2, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: vstr d0, [sp, #8] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #15] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #15] +; DONT-FUSE-NEXT: vldr d0, [sp, #8] +; DONT-FUSE-NEXT: add sp, #16 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fnma_f64 -; CHECK: vfnma.f64 %tmp1 = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone %tmp2 = fsub double -0.0, %tmp1 ret double %tmp2 } define arm_aapcs_vfpcc double @test_fnma_f64_2(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: test_fnma_f64_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vfnma.f64 d2, d0, d1 +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fnma_f64_2: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .save {r7, lr} +; DONT-FUSE-NEXT: push {r7, lr} +; DONT-FUSE-NEXT: .pad #24 +; DONT-FUSE-NEXT: sub sp, #24 +; DONT-FUSE-NEXT: vstr d0, [sp, #8] +; DONT-FUSE-NEXT: ldrb.w r0, [sp, #15] +; DONT-FUSE-NEXT: eor r0, r0, #128 +; DONT-FUSE-NEXT: strb.w r0, [sp, #15] +; DONT-FUSE-NEXT: ldrd r0, r1, [sp, #8] +; DONT-FUSE-NEXT: vstr d2, [sp, #16] +; DONT-FUSE-NEXT: ldrb.w r2, [sp, #23] +; DONT-FUSE-NEXT: eor r2, r2, #128 +; DONT-FUSE-NEXT: strb.w r2, [sp, #23] +; DONT-FUSE-NEXT: vldr d0, [sp, #16] +; DONT-FUSE-NEXT: vmov r2, r3, d1 +; DONT-FUSE-NEXT: vstr d0, [sp] +; DONT-FUSE-NEXT: bl fma +; DONT-FUSE-NEXT: vmov d0, r0, r1 +; DONT-FUSE-NEXT: add sp, #24 +; DONT-FUSE-NEXT: pop {r7, pc} entry: -; CHECK: test_fnma_f64_2 -; CHECK: vfnma.f64 %tmp1 = fsub double -0.0, %a %tmp2 = fsub double -0.0, %c %tmp3 = tail call double @llvm.fma.f64(double %tmp1, double %b, double %tmp2) nounwind readnone @@ -199,27 +534,69 @@ } define arm_aapcs_vfpcc float @test_fma_const_fold(float %a, float %b) nounwind { -; CHECK: test_fma_const_fold -; CHECK-NOT: vfma -; CHECK-NOT: vmul -; CHECK: vadd +; CHECK-LABEL: test_fma_const_fold: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fma_const_fold: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vadd.f32 s0, s0, s1 +; DONT-FUSE-NEXT: bx lr %ret = call float @llvm.fma.f32(float %a, float 1.0, float %b) ret float %ret } define arm_aapcs_vfpcc float @test_fma_canonicalize(float %a, float %b) nounwind { -; CHECK: test_fma_canonicalize -; CHECK: vmov.f32 [[R1:s[0-9]+]], #2.000000e+00 -; CHECK: vfma.f32 {{s[0-9]+}}, {{s[0-9]+}}, [[R1]] +; CHECK-LABEL: test_fma_canonicalize: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.f32 s2, #2.000000e+00 +; CHECK-NEXT: vfma.f32 s1, s0, s2 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: bx lr +; +; DONT-FUSE-LABEL: test_fma_canonicalize: +; DONT-FUSE: @ %bb.0: +; DONT-FUSE-NEXT: vmov.f32 s2, #2.000000e+00 +; DONT-FUSE-NEXT: vfma.f32 s1, s0, s2 +; DONT-FUSE-NEXT: vmov.f32 s0, s1 +; DONT-FUSE-NEXT: bx lr %ret = call float @llvm.fma.f32(float 2.0, float %a, float %b) ret float %ret } ; Check that very wide vector fma's can be split into legal fma's. define arm_aapcs_vfpcc void @test_fma_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>* %p) nounwind readnone ssp { -; CHECK: test_fma_v8f32 -; CHECK: vfma.f32 -; CHECK: vfma.f32 +; DONT-FUSE-LABEL: test_fma_v8f32: +; DONT-FUSE: @ %bb.0: @ %entry +; DONT-FUSE-NEXT: .vsave {d8, d9} +; DONT-FUSE-NEXT: vpush {d8, d9} +; DONT-FUSE-NEXT: vldr s16, [sp, #16] +; DONT-FUSE-NEXT: vldr s18, [sp, #20] +; DONT-FUSE-NEXT: vfma.f32 s16, s0, s8 +; DONT-FUSE-NEXT: vldr s0, [sp, #24] +; DONT-FUSE-NEXT: vldr s8, [sp, #28] +; DONT-FUSE-NEXT: vfma.f32 s0, s2, s10 +; DONT-FUSE-NEXT: vldr s2, [sp, #32] +; DONT-FUSE-NEXT: vldr s10, [sp, #36] +; DONT-FUSE-NEXT: vfma.f32 s2, s4, s12 +; DONT-FUSE-NEXT: vldr s4, [sp, #40] +; DONT-FUSE-NEXT: vldr s12, [sp, #44] +; DONT-FUSE-NEXT: vfma.f32 s18, s1, s9 +; DONT-FUSE-NEXT: vfma.f32 s8, s3, s11 +; DONT-FUSE-NEXT: vfma.f32 s10, s5, s13 +; DONT-FUSE-NEXT: vfma.f32 s4, s6, s14 +; DONT-FUSE-NEXT: vfma.f32 s12, s7, s15 +; DONT-FUSE-NEXT: vstr s16, [r0] +; DONT-FUSE-NEXT: vstr s18, [r0, #4] +; DONT-FUSE-NEXT: vstr s0, [r0, #8] +; DONT-FUSE-NEXT: vstr s8, [r0, #12] +; DONT-FUSE-NEXT: vstr s2, [r0, #16] +; DONT-FUSE-NEXT: vstr s10, [r0, #20] +; DONT-FUSE-NEXT: vstr s4, [r0, #24] +; DONT-FUSE-NEXT: vstr s12, [r0, #28] +; DONT-FUSE-NEXT: vpop {d8, d9} +; DONT-FUSE-NEXT: bx lr entry: %call = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone store <8 x float> %call, <8 x float>* %p, align 16