AArch64: select indexed FMLA/FMLS/FMUL for constrained (strict FP) operations.

Summary of the hunks below:
  * AArch64InstrInfo.td: the indexed FMLA/FMLS tied patterns, the
    FMLSIndexedAfterNegPatterns instantiations and the three FMUL-by-AArch64dup
    patterns now match any_fma / any_fmul instead of fma / fmul.
  * arm64-vmul.ll: adds vector tests that call
    llvm.experimental.constrained.fma with "fpexcept.strict" and expect the
    by-element fmls/fmla forms.
  * neon-scalar-by-elem-fma.ll: adds scalar-by-element *_strict variants of the
    existing fmla/fmls tests, using the constrained fma intrinsics.

NOTE(review): leading whitespace inside the hunks and the template parameter
list on the `multiclass FMLSIndexedAfterNegPatterns` context line were
reconstructed; confirm both against the target tree before applying.

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6280,18 +6280,18 @@
 // On the other hand, there are quite a few valid combinatorial options due to
 // the commutativity of multiplication and the fact that (-x) * y = x * (-y).
 defm : SIMDFPIndexedTiedPatterns<"FMLA",
-           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+           TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)>>;
 defm : SIMDFPIndexedTiedPatterns<"FMLA",
-           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+           TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)>>;
 
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+           TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+           TriOpFrag<(any_fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+           TriOpFrag<(any_fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+           TriOpFrag<(any_fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
 
 multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
   // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
@@ -6370,22 +6370,22 @@
 }
 
 defm : FMLSIndexedAfterNegPatterns<
-           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+           TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >;
 defm : FMLSIndexedAfterNegPatterns<
-           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+           TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)> >;
 
 defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
 defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>;
 
-def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
           (FMULv2i32_indexed V64:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
             (i64 0))>;
-def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+def : Pat<(v4f32 (any_fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
           (FMULv4i32_indexed V128:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
             (i64 0))>;
-def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
           (FMULv2i64_indexed V128:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
             (i64 0))>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -845,6 +845,90 @@
   ret <2 x double> %fmla1
 }
 
+define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+; CHECK-LABEL: fmls_indexed_2s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmls.2s v0, v2, v1[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fneg <2 x float> %c
+  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
+  %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x float> %fmls1
+}
+
+define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+; CHECK-LABEL: fmls_indexed_4s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls.4s v0, v2, v1[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fneg <4 x float> %c
+  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+  %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <4 x float> %fmls1
+}
+
+define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
+; CHECK-LABEL: fmls_indexed_2d_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls.2d v0, v2, v1[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fneg <2 x double> %c
+  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+  %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x double> %fmls1
+}
+
+define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_2s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    fmla.2s v0, v1, v2[0]
+; CHECK-NEXT:    ret
+entry:
+  %v1 = insertelement <2 x float> undef, float %c, i32 0
+  %v2 = insertelement <2 x float> %v1, float %c, i32 1
+  %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x float> %fmla1
+}
+
+define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_4s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    fmla.4s v0, v1, v2[0]
+; CHECK-NEXT:    ret
+entry:
+  %v1 = insertelement <4 x float> undef, float %c, i32 0
+  %v2 = insertelement <4 x float> %v1, float %c, i32 1
+  %v3 = insertelement <4 x float> %v2, float %c, i32 2
+  %v4 = insertelement <4 x float> %v3, float %c, i32 3
+  %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <4 x float> %fmla1
+}
+
+define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_2d_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmla.2d v0, v1, v2[0]
+; CHECK-NEXT:    ret
+entry:
+  %v1 = insertelement <2 x double> undef, double %c, i32 0
+  %v2 = insertelement <2 x double> %v1, double %c, i32 1
+  %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x double> %fmla1
+}
+
+attributes #0 = { strictfp }
+
+declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
+declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
+
 define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: mul_4h:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -1,7 +1,11 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
+attributes #0 = { strictfp }
+
 declare float @llvm.fma.f32(float, float, float)
 declare double @llvm.fma.f64(double, double, double)
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 
 define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
   ; CHECK-LABEL: test_fmla_ss4S
@@ -106,3 +110,105 @@
   ret double %tmp3
 }
 
+define float @test_fmla_ss4S_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_strict
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_swap_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_swap_strict
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_strict(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss2S_strict
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp2
+}
+
+define double @test_fmla_ddD_strict(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmla_ddD_strict
+  ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_strict
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_swap_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_swap_strict
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp2
+}
+
+define float @test_fmls_ss4S_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_strict
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fneg float %tmp1
+  %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp3
+}
+
+define float @test_fmls_ss4S_swap_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_swap_strict
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fneg float %tmp1
+  %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %tmp2, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp3
+}
+
+define float @test_fmls_ss2S_strict(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss2S_strict
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fneg float %tmp1
+  %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp3
+}
+
+define double @test_fmls_ddD_strict(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmls_ddD_strict
+  ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = fneg double %tmp1
+  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_strict
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fneg double %tmp1
+  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D_swap_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_swap_strict
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fneg double %tmp1
+  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %tmp2, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp3
+}
+