diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -460,6 +460,11 @@
   return hasNoVMLxHazardUse(N);
 }]>;
 
+// An 'fadd' node which can be contracted into a fma
+def fadd_contract : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+  return N->getFlags().hasAllowContract();
+}]>;
+
 def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
 def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3683,6 +3683,13 @@
                              (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
                              imm:$rot))>;
 
+    def: Pat<(VTI.Vec (fadd_contract MQPR:$Qd_src,
+                           (int_arm_mve_vcmulq imm:$rot,
+                               (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))),
+              (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+                             (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                             imm:$rot))>;
+
     def : Pat<(VTI.Vec (int_arm_mve_vcmlaq_predicated imm:$rot,
                             (VTI.Vec MQPR:$Qd_src),
                             (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
@@ -3690,7 +3697,6 @@
              (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qn),
                             (VTI.Vec MQPR:$Qm), imm:$rot, ARMVCCThen,
                             (VTI.Pred VCCR:$mask), zero_reg))>;
-
   }
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
@@ -390,14 +390,13 @@ define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: mul_addequal:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    add.w r12, sp, #16
 ; CHECK-NEXT:    vmov d0, r0, r1
 ; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    vldrw.u32 q2, [r12]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov d1, r2, r3
-; CHECK-NEXT:    vldrw.u32 q2, [r1]
-; CHECK-NEXT:    vcmul.f32 q3, q0, q1, #0
-; CHECK-NEXT:    vadd.f32 q2, q3, q2
+; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #0
 ; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
 ; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vmov r2, r3, d5
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmla.ll b/llvm/test/CodeGen/Thumb2/mve-vcmla.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vcmla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmla.ll
@@ -69,8 +69,7 @@ define arm_aapcs_vfpcc <4 x float> @muladd_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: muladd_f32x4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcmul.f32 q3, q1, q2, #0
-; CHECK-NEXT:    vadd.f32 q0, q3, q0
+; CHECK-NEXT:    vcmla.f32 q0, q1, q2, #0
 ; CHECK-NEXT:    bx lr
 entry:
   %d = tail call <4 x float> @llvm.arm.mve.vcmulq.v4f32(i32 0, <4 x float> %b, <4 x float> %c)
@@ -81,8 +80,7 @@ define arm_aapcs_vfpcc <4 x float> @muladd_c_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: muladd_c_f32x4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcmul.f32 q3, q1, q2, #90
-; CHECK-NEXT:    vadd.f32 q0, q0, q3
+; CHECK-NEXT:    vcmla.f32 q0, q1, q2, #90
 ; CHECK-NEXT:    bx lr
 entry:
   %d = tail call <4 x float> @llvm.arm.mve.vcmulq.v4f32(i32 1, <4 x float> %b, <4 x float> %c)
@@ -93,8 +91,7 @@ define arm_aapcs_vfpcc <8 x half> @muladd_f16x4(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; CHECK-LABEL: muladd_f16x4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcmul.f16 q1, q1, q2, #180
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vcmla.f16 q0, q1, q2, #180
 ; CHECK-NEXT:    bx lr
 entry:
   %d = tail call <8 x half> @llvm.arm.mve.vcmulq.v8f16(i32 2, <8 x half> %b, <8 x half> %c)
@@ -105,8 +102,7 @@ define arm_aapcs_vfpcc <8 x half> @muladd_c_f16x4(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; CHECK-LABEL: muladd_c_f16x4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcmul.f16 q1, q1, q2, #270
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vcmla.f16 q0, q1, q2, #270
 ; CHECK-NEXT:    bx lr
 entry:
   %d = tail call <8 x half> @llvm.arm.mve.vcmulq.v8f16(i32 3, <8 x half> %b, <8 x half> %c)
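
For reference, a minimal IR sketch of the shape the new fadd_contract pattern is meant to catch (the function and value names below are illustrative, not taken from the patch): the fadd fed by the vcmulq intrinsic must carry the 'contract' fast-math flag, which is what hasAllowContract() checks; without it the pattern does not fire and the separate vcmul + vadd sequence is kept.

declare <4 x float> @llvm.arm.mve.vcmulq.v4f32(i32, <4 x float>, <4 x float>)

; Illustrative example, not part of the patch's test files.
define arm_aapcs_vfpcc <4 x float> @fold_to_vcmla(<4 x float> %acc, <4 x float> %a, <4 x float> %b) {
entry:
  ; rotation operand 0 becomes the '#0' of the vcmla instruction
  %mul = tail call <4 x float> @llvm.arm.mve.vcmulq.v4f32(i32 0, <4 x float> %a, <4 x float> %b)
  ; 'contract' is the flag fadd_contract requires
  %res = fadd contract <4 x float> %mul, %acc
  ret <4 x float> %res
}

With the new Pat this should select to a single accumulating vcmla.f32 q0, q1, q2, #0, mirroring the updated muladd_f32x4 check lines above.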