diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1420,10 +1420,11 @@
   }
 
   Value *CreateFAdd(Value *L, Value *R, const Twine &Name = "",
-                    MDNode *FPMD = nullptr) {
+                    MDNode *FPMD = nullptr, Value *Flag = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fadd,
-                                      L, R, nullptr, Name, FPMD);
+                                      L, R, nullptr, Name, FPMD, llvm::None,
+                                      llvm::None, Flag);
 
     if (Value *V = foldConstant(Instruction::FAdd, L, R, Name)) return V;
     Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), FPMD, FMF);
@@ -1579,7 +1580,7 @@
       Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource = nullptr,
       const Twine &Name = "", MDNode *FPMathTag = nullptr,
       Optional<RoundingMode> Rounding = None,
-      Optional<fp::ExceptionBehavior> Except = None);
+      Optional<fp::ExceptionBehavior> Except = None, Value *Flag = nullptr);
 
   Value *CreateNeg(Value *V, const Twine &Name = "",
                    bool HasNUW = false, bool HasNSW = false) {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1709,7 +1709,7 @@
 
   def int_vector_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
                                          [LLVMVectorElementType<0>,
-                                          llvm_anyvector_ty]>;
+                                          llvm_anyvector_ty, llvm_i32_ty]>;
   def int_vector_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
                                          [LLVMVectorElementType<0>,
                                           llvm_anyvector_ty]>;
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -116,7 +116,26 @@
     Builder.setFastMathFlags(FMF);
     switch (ID) {
     default: llvm_unreachable("Unexpected intrinsic!");
-    case Intrinsic::vector_reduce_fadd:
+    case Intrinsic::vector_reduce_fadd: {
+      // An ordered reduction can't be expanded into a shuffle sequence. The
+      // operand is never null, so test its constant value; a non-constant or
+      // zero flag conservatively keeps the (always valid) ordered expansion.
+      Value *Acc = II->getArgOperand(0);
+      Value *Vec = II->getArgOperand(1);
+      auto *Flag = dyn_cast<ConstantInt>(II->getArgOperand(2));
+      if (!FMF.allowReassoc() && !(Flag && !Flag->isZero()))
+        Rdx = getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), RK);
+      else {
+        if (!isPowerOf2_32(
+                cast<FixedVectorType>(Vec->getType())->getNumElements()))
+          continue;
+
+        Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK);
+        Rdx = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(ID), Acc,
+                                  Rdx, "bin.rdx");
+      }
+      break;
+    }
     case Intrinsic::vector_reduce_fmul: {
       // FMFs must be attached to the call, otherwise it's an ordered reduction
       // and it can't be handled by generating a shuffle sequence.
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -2191,9 +2191,11 @@
       Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
                                                  ConstantInt::get(I32Ty, 0));
       Value *EltOp;
-      if (Name.contains(".add."))
-        EltOp = Builder.CreateFAdd(Elt0, Elt1);
-      else if (Name.contains(".sub."))
+      if (Name.contains(".add.")) {
+        // Combine only lane 0 of operands 0 and 1, like the other scalar ops
+        // below; no call operand is forwarded as CreateFAdd's Flag.
+        EltOp = Builder.CreateFAdd(Elt0, Elt1);
+      } else if (Name.contains(".sub."))
         EltOp = Builder.CreateFSub(Elt0, Elt1);
       else if (Name.contains(".mul."))
         EltOp = Builder.CreateFMul(Elt0, Elt1);
@@ -3036,7 +3038,8 @@
                                  { CI->getArgOperand(0), CI->getArgOperand(1),
                                    CI->getArgOperand(4) });
       } else {
-        Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
+        // Operand 2 is passed to EmitX86Select below, so it is not a Flag.
+        Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
       }
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -860,9 +860,8 @@
 
 CallInst *IRBuilderBase::CreateConstrainedFPBinOp(
     Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource,
-    const Twine &Name, MDNode *FPMathTag,
-    Optional<RoundingMode> Rounding,
-    Optional<fp::ExceptionBehavior> Except) {
+    const Twine &Name, MDNode *FPMathTag, Optional<RoundingMode> Rounding,
+    Optional<fp::ExceptionBehavior> Except, Value *Flag) {
   Value *RoundingV = getConstrainedFPRounding(Rounding);
   Value *ExceptV = getConstrainedFPExcept(Except);
 
@@ -870,8 +869,13 @@
   if (FMFSource)
     UseFMF = FMFSource->getFastMathFlags();
 
-  CallInst *C = CreateIntrinsic(ID, {L->getType()},
-                                {L, R, RoundingV, ExceptV}, nullptr, Name);
+  // Own the operands: assigning a braced list to a deduced
+  // std::initializer_list leaves it pointing at a destroyed temporary array.
+  SmallVector<Value *, 5> Args = {L, R, RoundingV, ExceptV};
+  if (Flag)
+    Args.insert(Args.begin() + 2, Flag);
+
+  CallInst *C = CreateIntrinsic(ID, {L->getType()}, Args, nullptr, Name);
   setConstrainedFPCallAttr(C);
   setFPAttrs(C, FPMathTag, UseFMF);
   return C;
diff --git a/llvm/test/Verifier/reduction-intrinsics.ll b/llvm/test/Verifier/reduction-intrinsics.ll
--- a/llvm/test/Verifier/reduction-intrinsics.ll
+++ b/llvm/test/Verifier/reduction-intrinsics.ll
@@ -18,7 +18,7 @@
 
 define float @fadd_match_arg_types(<4 x float> %x) {
 ; CHECK: Intrinsic has incorrect argument type!
-  %r = call float @llvm.vector.reduce.fadd.v4f32(double 0.0, <4 x float> %x)
+  %r = call float @llvm.vector.reduce.fadd.v4f32(double 0.0, <4 x float> %x, i32 0)
   ret float %r
 }
 
@@ -47,7 +47,7 @@
 
 define i32 @not_integer_reduce(<4 x i32> %x) {
 ; CHECK: Intrinsic has incorrect argument type!
-  %r = call i32 @llvm.vector.reduce.fadd.v4i32(i32 0, <4 x i32> %x)
+  %r = call i32 @llvm.vector.reduce.fadd.v4i32(i32 0, <4 x i32> %x, i32 0)
   ret i32 %r
 }
 
@@ -59,8 +59,8 @@
 
 declare float @llvm.vector.reduce.umin.v4f32(<4 x float>)
 declare i32* @llvm.vector.reduce.or.v4p0i32(<4 x i32*>)
-declare i32 @llvm.vector.reduce.fadd.v4i32(i32, <4 x i32>)
-declare float @llvm.vector.reduce.fadd.v4f32(double, <4 x float>)
+declare i32 @llvm.vector.reduce.fadd.v4i32(i32, <4 x i32>, i32)
+declare float @llvm.vector.reduce.fadd.v4f32(double, <4 x float>, i32)
 declare i32* @llvm.vector.reduce.fmin.v4p0i32(<4 x i32*>)
 declare float @llvm.vector.reduce.fmax.f32(float)
 declare i32 @llvm.vector.reduce.smax.i32(i32)