diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1234,10 +1234,13 @@
 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   switch (Intrinsic) {
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return Instruction::BinaryOps::FMul;
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
     return Instruction::BinaryOps::FAdd;
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
     return Instruction::BinaryOps::FSub;
   default:
     return Instruction::BinaryOpsEnd;
@@ -1283,7 +1286,14 @@
                                                    Intrinsic::aarch64_sve_mad>(
           IC, II, false))
     return MAD;
-  return instCombineSVEVectorBinOp(IC, II);
+  if (auto Replacement = instCombineSVEVectorBinOp(IC, II))
+    return Replacement;
+  if (auto FMLA_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmla_u>(
+              IC, II, true))
+    return FMLA_U;
+  return std::nullopt;
 }
 
 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
@@ -1302,7 +1312,109 @@
                                                   Intrinsic::aarch64_sve_fnmsb>(
           IC, II, false))
     return FMSB;
-  return instCombineSVEVectorBinOp(IC, II);
+  if (auto Replacement = instCombineSVEVectorBinOp(IC, II))
+    return Replacement;
+  if (auto FMLS_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmls_u>(
+              IC, II, true))
+    return FMLS_U;
+  return std::nullopt;
+}
+
+static std::optional<Instruction *>
+instCombineSVEAllActive2VA(InstCombiner &IC, IntrinsicInst &II) {
+  auto *OpPredicate = II.getOperand(0);
+  auto *OpA = II.getOperand(1);
+  auto *OpB = II.getOperand(2);
+  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+                              m_ConstantInt<AArch64SVEPredPattern::all>())))
+    return std::nullopt;
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  switch (II.getIntrinsicID()) {
+  default:
+    return std::nullopt;
+  case Intrinsic::aarch64_sve_fabd: {
+    auto FABD_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fabd_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FABD_U);
+  }
+  case Intrinsic::aarch64_sve_fdiv: {
+    auto FDIV_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fdiv_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FDIV_U);
+  }
+  case Intrinsic::aarch64_sve_fmax: {
+    auto FMAX_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fmax_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMAX_U);
+  }
+  case Intrinsic::aarch64_sve_fmaxnm: {
+    auto FMAXNM_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmaxnm_u, {II.getType()},
+                                {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMAXNM_U);
+  }
+  case Intrinsic::aarch64_sve_fmin: {
+    auto FMIN_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fmin_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMIN_U);
+  }
+  case Intrinsic::aarch64_sve_fminnm: {
+    auto FMINNM_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fminnm_u, {II.getType()},
+                                {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMINNM_U);
+  }
+  case Intrinsic::aarch64_sve_fmulx: {
+    auto FMULX_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmulx_u, {II.getType()},
+                                {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMULX_U);
+  }
+  }
+}
+
+static std::optional<Instruction *>
+instCombineSVEAllActive3VA(InstCombiner &IC, IntrinsicInst &II) {
+  auto *OpPredicate = II.getOperand(0);
+  auto *OpA = II.getOperand(1);
+  auto *OpB = II.getOperand(2);
+  auto *OpC = II.getOperand(3);
+  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+                              m_ConstantInt<AArch64SVEPredPattern::all>())))
+    return std::nullopt;
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  switch (II.getIntrinsicID()) {
+  default:
+    return std::nullopt;
+  case Intrinsic::aarch64_sve_fmla: {
+    auto FMLA_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FMLA_U);
+  }
+  case Intrinsic::aarch64_sve_fmls: {
+    auto FMLS_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmls_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FMLS_U);
+  }
+  case Intrinsic::aarch64_sve_fnmla: {
+    auto FNMLA_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fnmla_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FNMLA_U);
+  }
+  case Intrinsic::aarch64_sve_fnmls: {
+    auto FNMLS_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fnmls_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FNMLS_U);
+  }
+  }
 }
 
 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
@@ -1673,27 +1785,35 @@
   case Intrinsic::aarch64_sve_ptest_first:
   case Intrinsic::aarch64_sve_ptest_last:
     return instCombineSVEPTest(IC, II);
+  case Intrinsic::aarch64_sve_fabd:
+  case Intrinsic::aarch64_sve_fdiv:
+  case Intrinsic::aarch64_sve_fmax:
+  case Intrinsic::aarch64_sve_fmaxnm:
+  case Intrinsic::aarch64_sve_fmin:
+  case Intrinsic::aarch64_sve_fminnm:
+  case Intrinsic::aarch64_sve_fmulx:
+    return instCombineSVEAllActive2VA(IC, II);
+  case Intrinsic::aarch64_sve_fmla:
+  case Intrinsic::aarch64_sve_fmls:
+  case Intrinsic::aarch64_sve_fnmla:
+  case Intrinsic::aarch64_sve_fnmls:
+    return instCombineSVEAllActive3VA(IC, II);
   case Intrinsic::aarch64_sve_mul:
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return instCombineSVEVectorMul(IC, II);
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
   case Intrinsic::aarch64_sve_add:
     return instCombineSVEVectorAdd(IC, II);
-  case Intrinsic::aarch64_sve_fadd_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmla_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_add_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
-  case Intrinsic::aarch64_sve_fsub_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmls_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_sub_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mls_u>(
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
@@ -6,7 +6,7 @@
 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
 
-; SVE intrinsics fmul and fadd should be replaced with regular fmul and fadd
+; SVE intrinsics fmul, fmul_u, fadd, fadd_u, fsub and fsub_u should be replaced with regular fmul, fadd and fsub.
declare @llvm.aarch64.sve.fmul.nxv8f16(, , ) define @replace_fmul_intrinsic_half( %a, %b) #0 { ; CHECK-LABEL: @replace_fmul_intrinsic_half @@ -37,6 +37,36 @@ ret %2 } +declare @llvm.aarch64.sve.fmul.u.nxv8f16(, , ) +define @replace_fmul_u_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: @replace_fmul_u_intrinsic_half +; CHECK-NEXT: %1 = fmul fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmul.u.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmul.u.nxv4f32(, , ) +define @replace_fmul_u_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: @replace_fmul_u_intrinsic_float +; CHECK-NEXT: %1 = fmul fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmul.u.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmul.u.nxv2f64(, , ) +define @replace_fmul_u_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: @replace_fmul_u_intrinsic_double +; CHECK-NEXT: %1 = fmul fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmul.u.nxv2f64( %1, %a, %b) + ret %2 +} + declare @llvm.aarch64.sve.fadd.nxv8f16(, , ) define @replace_fadd_intrinsic_half( %a, %b) #0 { ; CHECK-LABEL: @replace_fadd_intrinsic_half @@ -67,6 +97,36 @@ ret %2 } +declare @llvm.aarch64.sve.fadd.u.nxv8f16(, , ) +define @replace_fadd_u_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: @replace_fadd_u_intrinsic_half +; CHECK-NEXT: %1 = fadd fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fadd.u.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fadd.u.nxv4f32(, , ) +define @replace_fadd_u_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: @replace_fadd_u_intrinsic_float +; CHECK-NEXT: %1 = fadd fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fadd.u.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fadd.u.nxv2f64(, , ) +define @replace_fadd_u_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: @replace_fadd_u_intrinsic_double +; CHECK-NEXT: %1 = fadd fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fadd.u.nxv2f64( %1, %a, %b) + ret %2 +} + declare @llvm.aarch64.sve.fsub.nxv8f16(, , ) define @replace_fsub_intrinsic_half( %a, %b) #0 { ; CHECK-LABEL: @replace_fsub_intrinsic_half @@ -87,7 +147,6 @@ ret %2 } - declare @llvm.aarch64.sve.fsub.nxv2f64(, , ) define @replace_fsub_intrinsic_double( %a, %b) #0 { ; CHECK-LABEL: @replace_fsub_intrinsic_double @@ -117,4 +176,44 @@ ret %2 } +declare @llvm.aarch64.sve.fsub.u.nxv8f16(, , ) +define @replace_fsub_u_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: @replace_fsub_u_intrinsic_half +; CHECK-NEXT: %1 = fsub fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fsub.u.nxv4f32(, , ) +define @replace_fsub_u_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: @replace_fsub_u_intrinsic_float +; CHECK-NEXT: %1 = fsub fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv4f32( %1, %a, %b) + ret %2 +} + +define @no_replace_on_non_ptrue_all_u( %a, %b) #0 { +; CHECK-LABEL: @no_replace_on_non_ptrue_all_u +; 
CHECK-NEXT: %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv2f64( %1, %a, %b) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fsub.u.nxv2f64(, , ) +define @replace_fsub_u_intrinsic_no_fast_flag( %a, %b) #0 { +; CHECK-LABEL: @replace_fsub_u_intrinsic_no_fast_flag +; CHECK-NEXT: %1 = fsub %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.aarch64.sve.fsub.u.nxv2f64( %1, %a, %b) + ret %2 +} + attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll @@ -0,0 +1,119 @@ +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; Idempotent fmuls_u -- should compile to just a ret. +define @idempotent_fmul_u_f16( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_f16( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_f32( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_f32( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv4f32( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_f64( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_f64( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_different_argument_order( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_different_argument_order( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG:%.*]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer), [[A:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) + ; Different argument order to the above tests. + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg, %1, %a) + ret %2 +} + +define @idempotent_fmul_u_with_predicated_dup( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_with_predicated_dup( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_two_dups( %pg, %a) #0 { + ; Edge case -- make sure that the case where we're fmultiplying two dups + ; together is sane. +; CHECK-LABEL: @idempotent_fmul_u_two_dups( +; CHECK-NEXT: ret shufflevector ( insertelement ( poison, half 0xH3C00, i64 0), poison, zeroinitializer) +; + %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) + %2 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) + %3 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %1, %2) + ret %3 +} + +; Non-idempotent fmuls_u -- we don't expect these to be optimised out. 
+define @non_idempotent_fmul_u_f16( %pg, %a) #0 { +; CHECK-LABEL: @non_idempotent_fmul_u_f16( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv8f16( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, half 0xH4000, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %a, %1) + ret %2 +} + +define @non_idempotent_fmul_u_f32( %pg, %a) #0 { +; CHECK-LABEL: @non_idempotent_fmul_u_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv4f32( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, float 2.000000e+00, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv4f32( %pg, %a, %1) + ret %2 +} + +define @non_idempotent_fmul_u_f64( %pg, %a) #0 { +; CHECK-LABEL: @non_idempotent_fmul_u_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg, %a, %1) + ret %2 +} + +define @non_idempotent_fmul_u_with_predicated_dup( %pg1, %pg2, %a) #0 { + ; Different predicates +; CHECK-LABEL: @non_idempotent_fmul_u_with_predicated_dup( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.nxv2f64( undef, [[PG1:%.*]], double 1.000000e+00) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG2:%.*]], [[A:%.*]], [[TMP1]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg1, double 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg2, %a, %1) + ret %2 +} + +declare @llvm.aarch64.sve.dup.x.nxv8f16(half) +declare @llvm.aarch64.sve.dup.x.nxv4f32(float) +declare @llvm.aarch64.sve.dup.x.nxv2f64(double) + +declare @llvm.aarch64.sve.dup.nxv2f64(, , double) +declare @llvm.aarch64.sve.dup.nxv8f16(, , half) + +declare @llvm.aarch64.sve.fmul.u.nxv8f16(, , ) +declare @llvm.aarch64.sve.fmul.u.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmul.u.nxv2f64(, , ) + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-m-to-x.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-m-to-x.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-m-to-x.ll @@ -0,0 +1,522 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +; Replace SVE merging intrinsics to their equivalent undef (_u) variants when they take an all active predicate. 
+ +declare @llvm.aarch64.sve.fabd.nxv8f16(, , ) +define @replace_fabd_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fabd_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fabd.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fabd.nxv4f32(, , ) +define @replace_fabd_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fabd_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fabd.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fabd.nxv2f64(, , ) +define @replace_fabd_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fabd_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fabd.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fabd_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fabd_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fabd.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fdiv.nxv8f16(, , ) +define @replace_fdiv_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fdiv_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fdiv.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) +define @replace_fdiv_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fdiv_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fdiv.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) +define @replace_fdiv_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fdiv_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call 
@llvm.aarch64.sve.fdiv.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fdiv_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fdiv_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fdiv.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmax.nxv8f16(, , ) +define @replace_fmax_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmax_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmax.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmax.nxv4f32(, , ) +define @replace_fmax_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmax_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmax.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmax.nxv2f64(, , ) +define @replace_fmax_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmax_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmax.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fmax_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fmax_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmax.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmaxnm.nxv8f16(, , ) +define @replace_fmaxnm_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmaxnm_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmaxnm.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmaxnm.nxv4f32(, , ) +define @replace_fmaxnm_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define 
@replace_fmaxnm_intrinsic_float
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmaxnm.u.nxv4f32( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv4f32( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmaxnm.nxv2f64(, , )
+define @replace_fmaxnm_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmaxnm_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmaxnm.u.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv2f64( %1, %a, %b)
+ ret %2
+}
+
+define @no_replace_fmaxnm_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @no_replace_fmaxnm_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmaxnm.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+ %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv2f64( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmin.nxv8f16(, , )
+define @replace_fmin_intrinsic_half( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmin_intrinsic_half
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmin.u.nxv8f16( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmin.nxv8f16( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmin.nxv4f32(, , )
+define @replace_fmin_intrinsic_float( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmin_intrinsic_float
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmin.u.nxv4f32( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmin.nxv4f32( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmin.nxv2f64(, , )
+define @replace_fmin_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmin_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmin.u.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmin.nxv2f64( %1, %a, %b)
+ ret %2
+}
+
+define @no_replace_fmin_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @no_replace_fmin_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmin.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+
%2 = tail call fast @llvm.aarch64.sve.fmin.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fminnm.nxv8f16(, , ) +define @replace_fminnm_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fminnm_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fminnm.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fminnm.nxv4f32(, , ) +define @replace_fminnm_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fminnm_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fminnm.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fminnm.nxv2f64(, , ) +define @replace_fminnm_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fminnm_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fminnm.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fminnm_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fminnm_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fminnm.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmla.nxv8f16(, , , ) +define @replace_fmla_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmla_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmla.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmla.nxv4f32(, , , ) +define @replace_fmla_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmla_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmla.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmla.nxv2f64(, , , ) +define @replace_fmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmla_intrinsic_double +; 
CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmla.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fmla_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmla.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmls.nxv8f16(, , , ) +define @replace_fmls_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmls_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmls.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmls.nxv4f32(, , , ) +define @replace_fmls_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmls_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmls.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmls.nxv2f64(, , , ) +define @replace_fmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmls.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmls.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmla.nxv8f16(, , , ) +define @replace_fnmla_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmla_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call 
@llvm.aarch64.sve.fnmla.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmla.nxv4f32(, , , ) +define @replace_fnmla_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmla_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmla.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmla.nxv2f64(, , , ) +define @replace_fnmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmla_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmla.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fnmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fnmla_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fnmla.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmls.nxv8f16(, , , ) +define @replace_fnmls_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmls_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmls.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmls.nxv4f32(, , , ) +define @replace_fnmls_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmls_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmls.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmls.nxv2f64(, , , ) +define @replace_fnmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmls.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call 
@llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fnmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fnmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fnmls.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +attributes #0 = { "target-features"="+sve" }
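For reference, a minimal standalone sketch of the combine this patch adds, distilled from the replace_fmin_intrinsic_float test above; it is illustrative only and not part of the patch, and the function name @example_fmin is made up. With a governing predicate produced by ptrue pattern 31 (all active), instcombine retargets the merging intrinsic call to its undef (_u) form; any other ptrue pattern leaves the call untouched.

; RUN: opt -S -passes=instcombine < %s | FileCheck %s
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

define <vscale x 4 x float> @example_fmin(<vscale x 4 x float> %a, <vscale x 4 x float> %b) "target-features"="+sve" {
; Before the combine: merging fmin guarded by an all-active predicate.
; After the combine, the call is expected to become:
;   %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmin.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %r = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
  ret <vscale x 4 x float> %r
}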