diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1234,10 +1234,13 @@
 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   switch (Intrinsic) {
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return Instruction::BinaryOps::FMul;
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
     return Instruction::BinaryOps::FAdd;
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
     return Instruction::BinaryOps::FSub;
   default:
     return Instruction::BinaryOpsEnd;
@@ -1283,7 +1286,14 @@
                                                    Intrinsic::aarch64_sve_mad>(
           IC, II, false))
     return MAD;
-  return instCombineSVEVectorBinOp(IC, II);
+  if (auto Replacement = instCombineSVEVectorBinOp(IC, II))
+    return Replacement;
+  if (auto FMLA_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmla_u>(
+              IC, II, true))
+    return FMLA_U;
+  return std::nullopt;
 }
 
 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
@@ -1302,7 +1312,109 @@
                                                   Intrinsic::aarch64_sve_fnmsb>(
           IC, II, false))
     return FMSB;
-  return instCombineSVEVectorBinOp(IC, II);
+  if (auto Replacement = instCombineSVEVectorBinOp(IC, II))
+    return Replacement;
+  if (auto FMLS_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmls_u>(
+              IC, II, true))
+    return FMLS_U;
+  return std::nullopt;
+}
+
+static std::optional<Instruction *>
+instCombineSVEAllActive2VA(InstCombiner &IC, IntrinsicInst &II) {
+  auto *OpPredicate = II.getOperand(0);
+  auto *OpA = II.getOperand(1);
+  auto *OpB = II.getOperand(2);
+  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+                              m_ConstantInt<AArch64SVEPredPattern::all>())))
+    return std::nullopt;
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  switch (II.getIntrinsicID()) {
+  default:
+    return std::nullopt;
+  case Intrinsic::aarch64_sve_fabd: {
+    auto FABD_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fabd_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FABD_U);
+  }
+  case Intrinsic::aarch64_sve_fdiv: {
+    auto FDIV_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fdiv_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FDIV_U);
+  }
+  case Intrinsic::aarch64_sve_fmax: {
+    auto FMAX_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fmax_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMAX_U);
+  }
+  case Intrinsic::aarch64_sve_fmaxnm: {
+    auto FMAXNM_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmaxnm_u, {II.getType()},
+                                {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMAXNM_U);
+  }
+  case Intrinsic::aarch64_sve_fmin: {
+    auto FMIN_U = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_fmin_u, {II.getType()}, {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMIN_U);
+  }
+  case Intrinsic::aarch64_sve_fminnm: {
+    auto FMINNM_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fminnm_u, {II.getType()},
+                                {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMINNM_U);
+  }
+  case Intrinsic::aarch64_sve_fmulx: {
+    auto FMULX_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmulx_u, {II.getType()},
+                                {OpPredicate, OpA, OpB});
+    return IC.replaceInstUsesWith(II, FMULX_U);
+  }
+  }
+}
+
+static std::optional<Instruction *>
+instCombineSVEAllActive3VA(InstCombiner &IC, IntrinsicInst &II) {
+  auto *OpPredicate = II.getOperand(0);
+  auto *OpA = II.getOperand(1);
+  auto *OpB = II.getOperand(2);
+  auto *OpC = II.getOperand(3);
+  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+                              m_ConstantInt<AArch64SVEPredPattern::all>())))
+    return std::nullopt;
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  switch (II.getIntrinsicID()) {
+  default:
+    return std::nullopt;
+  case Intrinsic::aarch64_sve_fmla: {
+    auto FMLA_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FMLA_U);
+  }
+  case Intrinsic::aarch64_sve_fmls: {
+    auto FMLS_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmls_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FMLS_U);
+  }
+  case Intrinsic::aarch64_sve_fnmla: {
+    auto FNMLA_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fnmla_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FNMLA_U);
+  }
+  case Intrinsic::aarch64_sve_fnmls: {
+    auto FNMLS_U =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fnmls_u, {II.getType()},
+                                {OpPredicate, OpB, OpC, OpA});
+    return IC.replaceInstUsesWith(II, FNMLS_U);
+  }
+  }
 }
 
 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
@@ -1673,27 +1785,35 @@
   case Intrinsic::aarch64_sve_ptest_first:
   case Intrinsic::aarch64_sve_ptest_last:
     return instCombineSVEPTest(IC, II);
+  case Intrinsic::aarch64_sve_fabd:
+  case Intrinsic::aarch64_sve_fdiv:
+  case Intrinsic::aarch64_sve_fmax:
+  case Intrinsic::aarch64_sve_fmaxnm:
+  case Intrinsic::aarch64_sve_fmin:
+  case Intrinsic::aarch64_sve_fminnm:
+  case Intrinsic::aarch64_sve_fmulx:
+    return instCombineSVEAllActive2VA(IC, II);
+  case Intrinsic::aarch64_sve_fmla:
+  case Intrinsic::aarch64_sve_fmls:
+  case Intrinsic::aarch64_sve_fnmla:
+  case Intrinsic::aarch64_sve_fnmls:
+    return instCombineSVEAllActive3VA(IC, II);
   case Intrinsic::aarch64_sve_mul:
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return instCombineSVEVectorMul(IC, II);
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
   case Intrinsic::aarch64_sve_add:
     return instCombineSVEVectorAdd(IC, II);
-  case Intrinsic::aarch64_sve_fadd_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmla_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_add_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
-  case Intrinsic::aarch64_sve_fsub_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmls_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_sub_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mls_u>(
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
@@ -6,7 +6,7 @@
 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
 
-; SVE intrinsics fmul and fadd should be replaced with regular fmul and fadd
+; SVE intrinsics fmul, fmul_u, fadd, fadd_u, fsub and fsub_u should be replaced with regular fmul, fadd and fsub.
declare @llvm.aarch64.sve.fmul.nxv8f16(, , ) define @replace_fmul_intrinsic_half( %a, %b) #0 { ; CHECK-LABEL: @replace_fmul_intrinsic_half @@ -37,6 +37,36 @@ ret %2 } +declare @llvm.aarch64.sve.fmul.u.nxv8f16(, , ) +define @replace_fmul_u_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: @replace_fmul_u_intrinsic_half +; CHECK-NEXT: %1 = fmul fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmul.u.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmul.u.nxv4f32(, , ) +define @replace_fmul_u_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: @replace_fmul_u_intrinsic_float +; CHECK-NEXT: %1 = fmul fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmul.u.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmul.u.nxv2f64(, , ) +define @replace_fmul_u_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: @replace_fmul_u_intrinsic_double +; CHECK-NEXT: %1 = fmul fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmul.u.nxv2f64( %1, %a, %b) + ret %2 +} + declare @llvm.aarch64.sve.fadd.nxv8f16(, , ) define @replace_fadd_intrinsic_half( %a, %b) #0 { ; CHECK-LABEL: @replace_fadd_intrinsic_half @@ -67,6 +97,36 @@ ret %2 } +declare @llvm.aarch64.sve.fadd.u.nxv8f16(, , ) +define @replace_fadd_u_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: @replace_fadd_u_intrinsic_half +; CHECK-NEXT: %1 = fadd fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fadd.u.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fadd.u.nxv4f32(, , ) +define @replace_fadd_u_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: @replace_fadd_u_intrinsic_float +; CHECK-NEXT: %1 = fadd fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fadd.u.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fadd.u.nxv2f64(, , ) +define @replace_fadd_u_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: @replace_fadd_u_intrinsic_double +; CHECK-NEXT: %1 = fadd fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fadd.u.nxv2f64( %1, %a, %b) + ret %2 +} + declare @llvm.aarch64.sve.fsub.nxv8f16(, , ) define @replace_fsub_intrinsic_half( %a, %b) #0 { ; CHECK-LABEL: @replace_fsub_intrinsic_half @@ -87,7 +147,6 @@ ret %2 } - declare @llvm.aarch64.sve.fsub.nxv2f64(, , ) define @replace_fsub_intrinsic_double( %a, %b) #0 { ; CHECK-LABEL: @replace_fsub_intrinsic_double @@ -117,4 +176,44 @@ ret %2 } +declare @llvm.aarch64.sve.fsub.u.nxv8f16(, , ) +define @replace_fsub_u_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: @replace_fsub_u_intrinsic_half +; CHECK-NEXT: %1 = fsub fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fsub.u.nxv4f32(, , ) +define @replace_fsub_u_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: @replace_fsub_u_intrinsic_float +; CHECK-NEXT: %1 = fsub fast %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv4f32( %1, %a, %b) + ret %2 +} + +define @no_replace_on_non_ptrue_all_u( %a, %b) #0 { +; CHECK-LABEL: @no_replace_on_non_ptrue_all_u +; 
CHECK-NEXT: %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv2f64( %1, %a, %b) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fsub.u.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fsub.u.nxv2f64(, , ) +define @replace_fsub_u_intrinsic_no_fast_flag( %a, %b) #0 { +; CHECK-LABEL: @replace_fsub_u_intrinsic_no_fast_flag +; CHECK-NEXT: %1 = fsub %a, %b +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.aarch64.sve.fsub.u.nxv2f64( %1, %a, %b) + ret %2 +} + attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll @@ -0,0 +1,119 @@ +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; Idempotent fmuls_u -- should compile to just a ret. +define @idempotent_fmul_u_f16( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_f16( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_f32( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_f32( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv4f32( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_f64( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_f64( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_different_argument_order( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_different_argument_order( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG:%.*]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer), [[A:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) + ; Different argument order to the above tests. + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg, %1, %a) + ret %2 +} + +define @idempotent_fmul_u_with_predicated_dup( %pg, %a) #0 { +; CHECK-LABEL: @idempotent_fmul_u_with_predicated_dup( +; CHECK-NEXT: ret [[A:%.*]] +; + %1 = call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %a, %1) + ret %2 +} + +define @idempotent_fmul_u_two_dups( %pg, %a) #0 { + ; Edge case -- make sure that the case where we're fmultiplying two dups + ; together is sane. +; CHECK-LABEL: @idempotent_fmul_u_two_dups( +; CHECK-NEXT: ret shufflevector ( insertelement ( poison, half 0xH3C00, i64 0), poison, zeroinitializer) +; + %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) + %2 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) + %3 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %1, %2) + ret %3 +} + +; Non-idempotent fmuls_u -- we don't expect these to be optimised out. 
+define @non_idempotent_fmul_u_f16( %pg, %a) #0 { +; CHECK-LABEL: @non_idempotent_fmul_u_f16( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv8f16( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, half 0xH4000, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv8f16( %pg, %a, %1) + ret %2 +} + +define @non_idempotent_fmul_u_f32( %pg, %a) #0 { +; CHECK-LABEL: @non_idempotent_fmul_u_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv4f32( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, float 2.000000e+00, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv4f32( %pg, %a, %1) + ret %2 +} + +define @non_idempotent_fmul_u_f64( %pg, %a) #0 { +; CHECK-LABEL: @non_idempotent_fmul_u_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg, %a, %1) + ret %2 +} + +define @non_idempotent_fmul_u_with_predicated_dup( %pg1, %pg2, %a) #0 { + ; Different predicates +; CHECK-LABEL: @non_idempotent_fmul_u_with_predicated_dup( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.nxv2f64( undef, [[PG1:%.*]], double 1.000000e+00) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG2:%.*]], [[A:%.*]], [[TMP1]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg1, double 1.0) + %2 = call @llvm.aarch64.sve.fmul.u.nxv2f64( %pg2, %a, %1) + ret %2 +} + +declare @llvm.aarch64.sve.dup.x.nxv8f16(half) +declare @llvm.aarch64.sve.dup.x.nxv4f32(float) +declare @llvm.aarch64.sve.dup.x.nxv2f64(double) + +declare @llvm.aarch64.sve.dup.nxv2f64(, , double) +declare @llvm.aarch64.sve.dup.nxv8f16(, , half) + +declare @llvm.aarch64.sve.fmul.u.nxv8f16(, , ) +declare @llvm.aarch64.sve.fmul.u.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmul.u.nxv2f64(, , ) + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-m-to-x.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-m-to-x.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-m-to-x.ll @@ -0,0 +1,522 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +; Replace SVE merging intrinsics to their equivalent undef (_u) variants when they take an all active predicate. 
+ +declare @llvm.aarch64.sve.fabd.nxv8f16(, , ) +define @replace_fabd_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fabd_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fabd.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fabd.nxv4f32(, , ) +define @replace_fabd_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fabd_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fabd.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fabd.nxv2f64(, , ) +define @replace_fabd_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fabd_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fabd.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fabd_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fabd_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fabd.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fabd.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fdiv.nxv8f16(, , ) +define @replace_fdiv_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fdiv_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fdiv.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) +define @replace_fdiv_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fdiv_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fdiv.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) +define @replace_fdiv_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fdiv_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call 
@llvm.aarch64.sve.fdiv.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fdiv_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fdiv_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fdiv.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fdiv.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmax.nxv8f16(, , ) +define @replace_fmax_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmax_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmax.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmax.nxv4f32(, , ) +define @replace_fmax_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmax_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmax.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmax.nxv2f64(, , ) +define @replace_fmax_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmax_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmax.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fmax_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fmax_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmax.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fmax.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmaxnm.nxv8f16(, , ) +define @replace_fmaxnm_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fmaxnm_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmaxnm.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmaxnm.nxv4f32(, , ) +define @replace_fmaxnm_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define 
@replace_fmaxnm_intrinsic_float
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmaxnm.u.nxv4f32( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv4f32( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmaxnm.nxv2f64(, , )
+define @replace_fmaxnm_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmaxnm_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmaxnm.u.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv2f64( %1, %a, %b)
+ ret %2
+}
+
+define @no_replace_fmaxnm_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @no_replace_fmaxnm_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmaxnm.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+ %2 = tail call fast @llvm.aarch64.sve.fmaxnm.nxv2f64( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmin.nxv8f16(, , )
+define @replace_fmin_intrinsic_half( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmin_intrinsic_half
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmin.u.nxv8f16( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmin.nxv8f16( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmin.nxv4f32(, , )
+define @replace_fmin_intrinsic_float( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmin_intrinsic_float
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmin.u.nxv4f32( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmin.nxv4f32( %1, %a, %b)
+ ret %2
+}
+
+declare @llvm.aarch64.sve.fmin.nxv2f64(, , )
+define @replace_fmin_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @replace_fmin_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmin.u.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call fast @llvm.aarch64.sve.fmin.nxv2f64( %1, %a, %b)
+ ret %2
+}
+
+define @no_replace_fmin_intrinsic_double( %a, %b) #0 {
+; CHECK-LABEL: define @no_replace_fmin_intrinsic_double
+; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmin.nxv2f64( [[TMP1]], [[A]], [[B]])
+; CHECK-NEXT: ret [[TMP2]]
+;
+ %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+
%2 = tail call fast @llvm.aarch64.sve.fmin.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fminnm.nxv8f16(, , ) +define @replace_fminnm_intrinsic_half( %a, %b) #0 { +; CHECK-LABEL: define @replace_fminnm_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fminnm.u.nxv8f16( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv8f16( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fminnm.nxv4f32(, , ) +define @replace_fminnm_intrinsic_float( %a, %b) #0 { +; CHECK-LABEL: define @replace_fminnm_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fminnm.u.nxv4f32( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv4f32( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fminnm.nxv2f64(, , ) +define @replace_fminnm_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @replace_fminnm_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fminnm.u.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv2f64( %1, %a, %b) + ret %2 +} + +define @no_replace_fminnm_intrinsic_double( %a, %b) #0 { +; CHECK-LABEL: define @no_replace_fminnm_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fminnm.nxv2f64( [[TMP1]], [[A]], [[B]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fminnm.nxv2f64( %1, %a, %b) + ret %2 +} + +declare @llvm.aarch64.sve.fmla.nxv8f16(, , , ) +define @replace_fmla_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmla_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmla.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmla.nxv4f32(, , , ) +define @replace_fmla_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmla_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmla.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmla.nxv2f64(, , , ) +define @replace_fmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmla_intrinsic_double +; 
CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmla.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fmla_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmla.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmls.nxv8f16(, , , ) +define @replace_fmls_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmls_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmls.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmls.nxv4f32(, , , ) +define @replace_fmls_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmls_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmls.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fmls.nxv2f64(, , , ) +define @replace_fmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fmls.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fmls.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmla.nxv8f16(, , , ) +define @replace_fnmla_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmla_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call 
@llvm.aarch64.sve.fnmla.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmla.nxv4f32(, , , ) +define @replace_fnmla_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmla_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmla.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmla.nxv2f64(, , , ) +define @replace_fnmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmla_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmla.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fnmla_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fnmla_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fnmla.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fnmla.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmls.nxv8f16(, , , ) +define @replace_fnmls_intrinsic_half( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmls_intrinsic_half +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmls.u.nxv8f16( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv8f16( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmls.nxv4f32(, , , ) +define @replace_fnmls_intrinsic_float( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmls_intrinsic_float +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmls.u.nxv4f32( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv4f32( %1, %a, %b, %c) + ret %2 +} + +declare @llvm.aarch64.sve.fnmls.nxv2f64(, , , ) +define @replace_fnmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @replace_fnmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.fnmls.u.nxv2f64( [[TMP1]], [[B]], [[C]], [[A]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call 
@llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +define @no_replace_fnmls_intrinsic_double( %a, %b, %c) #0 { +; CHECK-LABEL: define @no_replace_fnmls_intrinsic_double +; CHECK-SAME: ( [[A:%.*]], [[B:%.*]], [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast @llvm.aarch64.sve.fnmls.nxv2f64( [[TMP1]], [[A]], [[B]], [[C]]) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 5) + %2 = tail call fast @llvm.aarch64.sve.fnmls.nxv2f64( %1, %a, %b, %c) + ret %2 +} + +attributes #0 = { "target-features"="+sve" }
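For reference, a minimal standalone sketch of the combine this patch adds, distilled from the replace_fmin_intrinsic_float test above; it is illustrative only and not part of the patch, and the function name @example_fmin is made up. With a governing predicate produced by ptrue pattern 31 (all active), instcombine retargets the merging intrinsic call to its undef (_u) form; any other ptrue pattern leaves the call untouched.

; RUN: opt -S -passes=instcombine < %s | FileCheck %s
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

define <vscale x 4 x float> @example_fmin(<vscale x 4 x float> %a, <vscale x 4 x float> %b) "target-features"="+sve" {
; Before the combine: merging fmin guarded by an all-active predicate.
; After the combine, the call is expected to become:
;   %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmin.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %r = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
  ret <vscale x 4 x float> %r
}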