diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h
--- a/llvm/include/llvm/IR/Operator.h
+++ b/llvm/include/llvm/IR/Operator.h
@@ -243,6 +243,9 @@
   void operator|=(const FastMathFlags &OtherFlags) {
     Flags |= OtherFlags.Flags;
   }
+  bool operator!=(const FastMathFlags &OtherFlags) const {
+    return Flags != OtherFlags.Flags;
+  }
 };
 
 /// Utility class for floating point operations which can have
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -695,6 +695,43 @@
   return None;
 }
 
+static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
+                                                        IntrinsicInst &II) {
+  // fold (fadd a (fmul b c)) -> (fma a b c)
+  Value *p, *FMul, *a, *b, *c;
+  auto m_SVEFAdd = [](auto p, auto w, auto x) {
+    return m_CombineOr(m_Intrinsic<Intrinsic::aarch64_sve_fadd>(p, w, x),
+                       m_Intrinsic<Intrinsic::aarch64_sve_fadd>(p, x, w));
+  };
+  auto m_SVEFMul = [](auto p, auto y, auto z) {
+    return m_Intrinsic<Intrinsic::aarch64_sve_fmul>(p, y, z);
+  };
+  if (!match(&II, m_SVEFAdd(m_Value(p), m_Value(a),
+                            m_CombineAnd(m_Value(FMul),
+                                         m_SVEFMul(m_Deferred(p), m_Value(b),
+                                                   m_Value(c))))))
+    return None;
+
+  if (!FMul->hasOneUse())
+    return None;
+
+  llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
+  llvm::FastMathFlags FMulFlags = cast<CallInst>(FMul)->getFastMathFlags();
+  if (!FAddFlags.allowReassoc() || !FMulFlags.allowReassoc())
+    return None;
+  if (!FAddFlags.allowContract() || !FMulFlags.allowContract())
+    return None;
+  if (FAddFlags != FMulFlags)
+    return None;
+
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
+                                      {II.getType()}, {p, a, b, c}, &II);
+  FMLA->setFastMathFlags(FAddFlags);
+  return IC.replaceInstUsesWith(II, FMLA);
+}
+
 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   switch (Intrinsic) {
   case Intrinsic::aarch64_sve_fmul:
@@ -724,6 +761,19 @@
   return IC.replaceInstUsesWith(II, BinOp);
 }
 
+static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
+                                                        IntrinsicInst &II,
+                                                        const TargetOptions &Options) {
+  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+                              Options.UnsafeFPMath);
+  if (AllowFusionGlobally) {
+    auto FMLA = instCombineSVEVectorFMLA(IC, II);
+    if (FMLA)
+      return FMLA;
+  }
+  return instCombineSVEVectorBinOp(IC, II);
+}
+
 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                        IntrinsicInst &II) {
   auto *OpPredicate = II.getOperand(0);
@@ -900,7 +950,10 @@
   case Intrinsic::aarch64_sve_mul:
   case Intrinsic::aarch64_sve_fmul:
     return instCombineSVEVectorMul(IC, II);
-  case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd: {
+    const TargetOptions &Options = ST->getTargetLowering()->getTargetMachine().Options;
+    return instCombineSVEVectorFAdd(IC, II, Options);
+  }
   case Intrinsic::aarch64_sve_fsub:
     return instCombineSVEVectorBinOp(IC, II);
   case Intrinsic::aarch64_sve_tbl:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmla.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmla.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmla.ll
@@ -0,0 +1,109 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 8 x half> @combine_fmla(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @combine_fmla
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: ret <vscale x 8 x half> %6
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %7
+}
+
+define dso_local <vscale x 8 x half> @combine_fmla_contract_flag_only(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @combine_fmla_contract_flag_only
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = call contract <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: ret <vscale x 8 x half> %6
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %7
+}
+
+define dso_local <vscale x 8 x half> @neg_combine_fmla_no_fast_flag(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @neg_combine_fmla_no_fast_flag
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: %7 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+; CHECK-NEXT: ret <vscale x 8 x half> %7
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %7
+}
+
+define dso_local <vscale x 8 x half> @neg_combine_fmla_no_fmul(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @neg_combine_fmla_no_fmul
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+; CHECK-NEXT: ret <vscale x 8 x half> %7
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %7
+}
+
+define dso_local <vscale x 8 x half> @neg_combine_fmla_neq_pred(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @neg_combine_fmla_neq_pred
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+; CHECK-NEXT: %7 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %6)
+; CHECK-NEXT: %8 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: %9 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %7, <vscale x 8 x half> %1, <vscale x 8 x half> %8)
+; CHECK-NEXT: ret <vscale x 8 x half> %9
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+  %7 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %6)
+  %8 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %9 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %7, <vscale x 8 x half> %1, <vscale x 8 x half> %8)
+  ret <vscale x 8 x half> %9
+}
+
+define dso_local <vscale x 8 x half> @neg_combine_fmla_two_fmul_uses(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @neg_combine_fmla_two_fmul_uses
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+; CHECK-NEXT: %8 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %7, <vscale x 8 x half> %6)
+; CHECK-NEXT: ret <vscale x 8 x half> %8
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  %8 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %7, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @neg_combine_fmla_and_flags_1(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @neg_combine_fmla_and_flags_1
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: %7 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+; CHECK-NEXT: ret <vscale x 8 x half> %7
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %7
+}
+
+define dso_local <vscale x 8 x half> @neg_combine_fmla_and_flags_2(<vscale x 16 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3) local_unnamed_addr #0 {
+; CHECK-LABEL: @neg_combine_fmla_and_flags_2
+; CHECK-NEXT: %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+; CHECK-NEXT: %6 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+; CHECK-NEXT: %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+; CHECK-NEXT: ret <vscale x 8 x half> %7
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %6 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %2, <vscale x 8 x half> %3)
+  %7 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %5, <vscale x 8 x half> %1, <vscale x 8 x half> %6)
+  ret <vscale x 8 x half> %7
+}
+
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+
+attributes #0 = { "target-features"="+sve" }
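
Note (not part of the patch, for context only): the intrinsic pair matched by instCombineSVEVectorFMLA is what clang typically emits for a predicated multiply-add written with the ACLE intrinsics. The sketch below is illustrative; the function name is made up, and it assumes clang is invoked with SVE enabled and a fast-math mode (e.g. -O2 -march=armv8-a+sve -ffast-math) so that the generated llvm.aarch64.sve.fmul/llvm.aarch64.sve.fadd calls carry the fast-math flags and the UnsafeFPMath/AllowFPOpFusion gate in instCombineSVEVectorFAdd is satisfied.

    // Illustrative sketch only, not part of this patch.
    // Assumes the ACLE intrinsics from <arm_sve.h>.
    #include <arm_sve.h>

    // Computes a + b * c element-wise under the predicate pg. With fast-math
    // enabled, clang should lower the two calls to llvm.aarch64.sve.fmul
    // followed by llvm.aarch64.sve.fadd on the same predicate, which this
    // patch then folds into a single llvm.aarch64.sve.fmla call.
    svfloat16_t madd_f16(svbool_t pg, svfloat16_t a, svfloat16_t b,
                         svfloat16_t c) {
      svfloat16_t prod = svmul_f16_x(pg, b, c);
      return svadd_f16_x(pg, a, prod);
    }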