Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -4734,6 +4734,12 @@ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Match @llvm.fma.* intrinsics +def : Pat<(v4f16 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), + (VFMAhd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasFullFP16]>; +def : Pat<(v8f16 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), + (VFMAhq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasFullFP16]>; def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; Index: test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll =================================================================== --- test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll +++ test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll @@ -911,34 +911,48 @@ ret <8 x half> %sub.i } -; FIXME (PR38404) -; -;define dso_local <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -;entry: -; %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a) -; ret <4 x half> %0 -;} +define dso_local <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; CHECK-LABEL: test_vfma_f16: +; CHECK: vfma.f16 d0, d1, d2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a) + ret <4 x half> %0 +} -;define dso_local <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -;entry: -; %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a) -; ret <8 x half> %0 -;} +define dso_local <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; CHECK-LABEL: test_vfmaq_f16: +; CHECK: vfma.f16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a) + ret <8 x half> %0 +} -;define dso_local <4 x half> 
@test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -;entry: -; %sub.i = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b -; %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub.i, <4 x half> %c, <4 x half> %a) -; ret <4 x half> %0 -;} +define dso_local <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; CHECK-LABEL: test_vfms_f16: +; CHECK: vneg.f16 [[D16:d[0-9]+]], d1 +; CHECK-NEXT: vfma.f16 d0, [[D16]], d2 +; CHECK-NEXT: bx lr +entry: + %sub.i = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b + %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub.i, <4 x half> %c, <4 x half> %a) + ret <4 x half> %0 +} -;define dso_local <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -;entry: -; %sub.i = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b -; %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub.i, <8 x half> %c, <8 x half> %a) -; ret <8 x half> %0 -;} +define dso_local <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; CHECK-LABEL: test_vfmsq_f16: +; CHECK: vneg.f16 [[Q8:q[0-9]+]], q1 +; CHECK-NEXT: vfma.f16 q0, [[Q8]], q2 +; CHECK-NEXT: bx lr +entry: + %sub.i = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b + %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub.i, <8 x half> %c, <8 x half> %a) + ret <8 x half> %0 +} +; FIXME (PR38404) +; ;define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) { ;entry: ; %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32>