Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -13455,37 +13455,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating-point
+The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point
 ``ADD`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
+reduction will not preserve the associativity of an equivalent scalarized
+counterpart. Otherwise the reduction will be *ordered*, thus implying that
+the operation respects the associativity of a scalarized reduction.
 
 Arguments:
 """"""""""
 
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar start value for the reduction.
+The type of the start value matches the element-type of the vector input.
 The second argument must be a vector of floating-point values.
 
 Examples:
@@ -13493,8 +13490,8 @@
 
 .. code-block:: llvm
 
-      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction
+      %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction
 
 '``llvm.experimental.vector.reduce.mul.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -13519,37 +13516,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating-point
+The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point
 ``MUL`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
+reduction will not preserve the associativity of an equivalent scalarized
+counterpart. Otherwise the reduction will be *ordered*, thus implying that
+the operation respects the associativity of a scalarized reduction.
 
 Arguments:
 """"""""""
 
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar start value for the reduction.
+The type of the start value matches the element-type of the vector input.
 The second argument must be a vector of floating-point values.
 
 Examples:
@@ -13557,8 +13551,8 @@
 
 .. code-block:: llvm
 
-      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction
+      %ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction
 
 '``llvm.experimental.vector.reduce.and.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
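
For illustration, a short self-contained IR sample of the documented semantics (a sketch for this write-up, not taken from the patch; the function name @reduce_example is made up): with the v2 intrinsics the start value always participates in the result. An ordered <4 x float> fadd reduction computes (((%start + %a0) + %a1) + %a2) + %a3, while a call carrying the reassoc or fast flag may regroup the element operations, for example %start + ((%a0 + %a1) + (%a2 + %a3)). The fmul form behaves analogously, with 1.0 as the natural identity start value.

::

      declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

      ; illustrative sample, not part of the patch
      define float @reduce_example(<4 x float> %a) {
        ; ordered: evaluated strictly element by element, starting from the start value
        %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a)
        ; relaxed: may be computed as an unordered reduction, with the start value folded in at the end
        %rel = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a)
        %res = fadd float %ord, %rel
        ret float %res
      }
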
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -1057,8 +1057,8 @@
     case Intrinsic::experimental_vector_reduce_and:
     case Intrinsic::experimental_vector_reduce_or:
     case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_fadd:
-    case Intrinsic::experimental_vector_reduce_fmul:
+    case Intrinsic::experimental_vector_reduce_v2_fadd:
+    case Intrinsic::experimental_vector_reduce_v2_fmul:
     case Intrinsic::experimental_vector_reduce_smax:
     case Intrinsic::experimental_vector_reduce_smin:
     case Intrinsic::experimental_vector_reduce_fmax:
@@ -1248,12 +1248,16 @@
   case Intrinsic::experimental_vector_reduce_xor:
     return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
                                                    /*IsPairwiseForm=*/false);
-  case Intrinsic::experimental_vector_reduce_fadd:
-    return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
-                                                   /*IsPairwiseForm=*/false);
-  case Intrinsic::experimental_vector_reduce_fmul:
-    return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
-                                                   /*IsPairwiseForm=*/false);
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+    return ConcreteTTI->getArithmeticReductionCost(
+        Instruction::FAdd, Tys[0],
+        /*IsPairwiseForm=*/false); // FIXME: This should be set to
+                                   // 'FMF.allowReassoc()'
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
+    return ConcreteTTI->getArithmeticReductionCost(
+        Instruction::FMul, Tys[0],
+        /*IsPairwiseForm=*/false); // FIXME: This should be set to
+                                   // 'FMF.allowReassoc()'
   case Intrinsic::experimental_vector_reduce_smax:
   case Intrinsic::experimental_vector_reduce_smin:
   case Intrinsic::experimental_vector_reduce_fmax:
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -1122,14 +1122,14 @@
 
 //===------------------------ Reduction Intrinsics ------------------------===//
 //
-def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
-                                                    [LLVMMatchType<0>,
-                                                     llvm_anyvector_ty],
-                                                    [IntrNoMem]>;
-def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
-                                                    [LLVMMatchType<0>,
-                                                     llvm_anyvector_ty],
-                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty],
+                                                       [LLVMMatchType<0>,
+                                                        llvm_anyvector_ty],
+                                                       [IntrNoMem]>;
+def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty],
+                                                       [LLVMMatchType<0>,
+                                                        llvm_anyvector_ty],
+                                                       [IntrNoMem]>;
 def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
                                                    [llvm_anyvector_ty],
                                                    [IntrNoMem]>;
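
As a rough guide to the lowering changes that follow (an informal sketch, not code from the patch; the function names are invented here), a call with the reassoc flag may be expanded into an unordered log2 shuffle reduction of the vector followed by a single scalar operation that folds in the start value, which is what the ExpandReductions and SelectionDAGBuilder changes below implement. An ordered call (no reassoc or fast) is kept strictly sequential.

::

      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

      define float @fmul_relaxed(float %start, <4 x float> %v) {
        %r = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start, <4 x float> %v)
        ret float %r
      }

      ; roughly equivalent expansion of the call above: shuffle-based
      ; unordered reduction, then one scalar multiply with the start value
      define float @fmul_relaxed_expanded(float %start, <4 x float> %v) {
        %rdx.shuf = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
        %bin.rdx = fmul reassoc <4 x float> %v, %rdx.shuf
        %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
        %bin.rdx2 = fmul reassoc <4 x float> %bin.rdx, %rdx.shuf1
        %elt = extractelement <4 x float> %bin.rdx2, i32 0
        %r = fmul reassoc float %start, %elt
        ret float %r
      }
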
Index: lib/CodeGen/ExpandReductions.cpp
===================================================================
--- lib/CodeGen/ExpandReductions.cpp
+++ lib/CodeGen/ExpandReductions.cpp
@@ -29,9 +29,9 @@
 
 unsigned getOpcode(Intrinsic::ID ID) {
   switch (ID) {
-  case Intrinsic::experimental_vector_reduce_fadd:
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
     return Instruction::FAdd;
-  case Intrinsic::experimental_vector_reduce_fmul:
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
     return Instruction::FMul;
   case Intrinsic::experimental_vector_reduce_add:
     return Instruction::Add;
@@ -90,12 +90,11 @@
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
     switch (ID) {
-    case Intrinsic::experimental_vector_reduce_fadd:
-    case Intrinsic::experimental_vector_reduce_fmul:
+    case Intrinsic::experimental_vector_reduce_v2_fadd:
+    case Intrinsic::experimental_vector_reduce_v2_fmul:
       // FMFs must be attached to the call, otherwise it's an ordered reduction
       // and it can't be handled by generating a shuffle sequence.
-      if (!II->getFastMathFlags().isFast())
-        IsOrdered = true;
+      IsOrdered = !II->getFastMathFlags().allowReassoc();
       Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6646,8 +6646,8 @@
     LowerDeoptimizeCall(&I);
     return nullptr;
 
-  case Intrinsic::experimental_vector_reduce_fadd:
-  case Intrinsic::experimental_vector_reduce_fmul:
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
   case Intrinsic::experimental_vector_reduce_add:
   case Intrinsic::experimental_vector_reduce_mul:
   case Intrinsic::experimental_vector_reduce_and:
@@ -8688,15 +8688,17 @@
     FMF = I.getFastMathFlags();
 
   switch (Intrinsic) {
-  case Intrinsic::experimental_vector_reduce_fadd:
-    if (FMF.isFast())
-      Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+    if (FMF.allowReassoc())
+      Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
+                        DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2));
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
     break;
-  case Intrinsic::experimental_vector_reduce_fmul:
-    if (FMF.isFast())
-      Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
+    if (FMF.allowReassoc())
+      Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
+                        DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2));
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
     break;
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -605,6 +605,26 @@
     }
     break;
   }
+  case 'e': {
+    SmallVector<StringRef, 2> Groups;
+    Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+");
+    if (R.match(Name, &Groups)) {
+      Intrinsic::ID ID = Intrinsic::not_intrinsic;
+      if (Groups[1] == "fadd")
+        ID = Intrinsic::experimental_vector_reduce_v2_fadd;
+      if (Groups[1] == "fmul")
+        ID = Intrinsic::experimental_vector_reduce_v2_fmul;
+
+      if (ID != Intrinsic::not_intrinsic) {
+        rename(F);
+        auto Args = F->getFunctionType()->params();
+        Type *Tys[] = {F->getFunctionType()->getReturnType(), Args[1]};
+        NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+        return true;
+      }
+    }
+    break;
+  }
   case 'i':
   case 'l': {
     bool IsLifetimeStart = Name.startswith("lifetime.start");
@@ -3448,7 +3468,28 @@
     DefaultCase();
     return;
   }
-
+  case Intrinsic::experimental_vector_reduce_v2_fmul: {
+    SmallVector<Value *, 2> Args;
+    if (CI->isFast())
+      Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0));
+    else
+      Args.push_back(CI->getOperand(0));
+    Args.push_back(CI->getOperand(1));
+    NewCall = Builder.CreateCall(NewFn, Args);
+    cast<Instruction>(NewCall)->copyFastMathFlags(CI);
+    break;
+  }
+  case Intrinsic::experimental_vector_reduce_v2_fadd: {
+    SmallVector<Value *, 2> Args;
+    if (CI->isFast())
+      Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType()));
+    else
+      Args.push_back(CI->getOperand(0));
+    Args.push_back(CI->getOperand(1));
+    NewCall = Builder.CreateCall(NewFn, Args);
+    cast<Instruction>(NewCall)->copyFastMathFlags(CI);
+    break;
+  }
   case Intrinsic::arm_neon_vld1:
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3:
Index: lib/IR/IRBuilder.cpp
===================================================================
--- lib/IR/IRBuilder.cpp
+++ lib/IR/IRBuilder.cpp
@@ -323,7 +323,7 @@
   Value *Ops[] = {Acc, Src};
   Type *Tys[] = {Acc->getType(), Src->getType()};
   auto Decl = Intrinsic::getDeclaration(
-      M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+      M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys);
   return createCallHelper(Decl, Ops, this);
 }
 
@@ -332,7 +332,7 @@
   Value *Ops[] = {Acc, Src};
   Type *Tys[] = {Acc->getType(), Src->getType()};
   auto Decl = Intrinsic::getDeclaration(
-      M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+      M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys);
   return createCallHelper(Decl, Ops, this);
 }
 
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -804,13 +804,9 @@
                              ArrayRef<Value *> RedOps) {
   assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
 
-  Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
   std::function<Value *()> BuildFunc;
   using RD = RecurrenceDescriptor;
   RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
-  // TODO: Support creating ordered reductions.
-  FastMathFlags FMFFast;
-  FMFFast.setFast();
 
   switch (Opcode) {
   case Instruction::Add:
@@ -830,15 +826,17 @@
     break;
   case Instruction::FAdd:
     BuildFunc = [&]() {
-      auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
-      cast<CallInst>(Rdx)->setFastMathFlags(FMFFast);
+      auto Rdx = Builder.CreateFAddReduce(
+          Constant::getNullValue(Src->getType()->getVectorElementType()), Src);
+      cast<CallInst>(Rdx)->setFastMathFlags(FMF);
       return Rdx;
     };
     break;
   case Instruction::FMul:
     BuildFunc = [&]() {
-      auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
-      cast<CallInst>(Rdx)->setFastMathFlags(FMFFast);
+      Type *Ty = Src->getType()->getVectorElementType();
+      auto Rdx = Builder.CreateFMulReduce(ConstantFP::get(Ty, 1.0), Src);
+      cast<CallInst>(Rdx)->setFastMathFlags(FMF);
       return Rdx;
     };
     break;
Index: test/Assembler/invalid-vecreduce.ll
===================================================================
--- test/Assembler/invalid-vecreduce.ll
+++ test/Assembler/invalid-vecreduce.ll
@@ -1,34 +1,34 @@
 ; RUN: not opt -S < %s 2>&1 | FileCheck %s
 
 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64
+; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64
 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
-  %res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+  %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
   ret float %res
 }
 
 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64 +; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) ret double %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64 +; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) { - %res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) + %res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) ret <2 x double> %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64 +; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) ret double %res } -declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) -declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) +declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) Index: test/Bitcode/upgrade-vecreduce-intrinsics.ll =================================================================== --- /dev/null +++ test/Bitcode/upgrade-vecreduce-intrinsics.ll @@ -0,0 +1,64 @@ +; RUN: opt -S < %s | FileCheck %s +; RUN: llvm-dis < %s.bc | FileCheck %s + +define float @fadd_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_acc +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_undef +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_fast_acc +; CHECK: %res = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_fast_undef +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_acc +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_undef +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_fast_acc +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_fast_undef +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) + +declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) Index: test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) +declare float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -53,7 +53,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -78,6 +78,6 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } Index: test/CodeGen/AArch64/vecreduce-fadd.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd.ll +++ test/CodeGen/AArch64/vecreduce-fadd.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: add_HalfS: ; CHECK: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) ret float %r } @@ -23,7 +23,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) ret half %r } @@ -45,7 +45,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd 
v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) ret half %r } @@ -55,7 +55,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) ret float %r } @@ -63,7 +63,7 @@ ; CHECK-LABEL: add_D: ; CHECK: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) ret double %r } @@ -84,7 +84,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) ret half %r } @@ -95,7 +95,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) ret float %r } @@ -104,16 +104,16 @@ ; CHECK: fadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) ret double %r } ; Function Attrs: nounwind readnone -declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) Index: test/CodeGen/Generic/expand-experimental-reductions.ll =================================================================== --- test/CodeGen/Generic/expand-experimental-reductions.ll +++ 
test/CodeGen/Generic/expand-experimental-reductions.ll @@ -7,8 +7,8 @@ declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>) @@ -95,7 +95,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) ret float %r } @@ -110,7 +110,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -128,7 +128,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -146,7 +146,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -161,7 +161,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) ret float %r } @@ -176,7 +176,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -194,7 +194,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -212,7 +212,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } Index: test/CodeGen/X86/haddsub.ll =================================================================== --- test/CodeGen/X86/haddsub.ll +++ test/CodeGen/X86/haddsub.ll @@ -1355,8 +1355,8 @@ ; Repeat tests from general reductions to verify output for hoppy targets: ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float, <8 x float>) +declare double 
@llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double, <4 x double>) define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: @@ -1398,7 +1398,7 @@ ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1434,7 +1434,7 @@ ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } Index: test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -35,7 +35,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -74,7 +74,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -121,7 +121,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -209,7 +209,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -249,7 +249,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -297,7 +297,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -352,7 +352,7 @@ ; 
AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -426,7 +426,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -474,7 +474,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -529,7 +529,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -556,7 +556,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -586,7 +586,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -621,7 +621,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -663,7 +663,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -722,7 +722,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -800,7 +800,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -859,7 +859,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -895,7 +895,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -937,16 +937,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float, <8 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double, <2 x double>) +declare double 
@llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double, <4 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double, <8 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double, <16 x double>) Index: test/CodeGen/X86/vector-reduce-fadd.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd.ll +++ test/CodeGen/X86/vector-reduce-fadd.ll @@ -38,7 +38,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -89,7 +89,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -326,7 +326,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -366,7 +366,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -421,7 +421,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -511,7 +511,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -666,7 +666,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -698,7 +698,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -745,7 +745,7 @@ ; 
AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -827,7 +827,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -974,7 +974,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -1003,7 +1003,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1041,7 +1041,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1100,7 +1100,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1201,7 +1201,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1233,7 +1233,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1274,7 +1274,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1336,7 +1336,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1439,7 +1439,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, 
<16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
ret double %1
}
@@ -1465,7 +1465,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
@@ -1500,7 +1500,7 @@
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
@@ -1556,7 +1556,7 @@
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
@@ -1653,16 +1653,16 @@
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double, <16 x double>)
Index: test/CodeGen/X86/vector-reduce-fmul-fast.ll
===================================================================
--- test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -35,7 +35,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
@@ -74,7 +74,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
@@ -121,7 +121,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
@@ -175,7 +175,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
@@ -209,7 +209,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
@@ -249,7 +249,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
@@ -297,7 +297,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
@@ -352,7 +352,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
@@ -386,7 +386,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
@@ -426,7 +426,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
@@ -474,7 +474,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
@@ -529,7 +529,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
@@ -556,7 +556,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
@@ -586,7 +586,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
@@ -621,7 +621,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
@@ -663,7 +663,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
@@ -691,7 +691,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
@@ -722,7 +722,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
@@ -758,7 +758,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
@@ -800,7 +800,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
@@ -828,7 +828,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
@@ -859,7 +859,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
@@ -895,7 +895,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
@@ -937,16 +937,16 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double, <16 x double>)
Index: test/CodeGen/X86/vector-reduce-fmul.ll
===================================================================
--- test/CodeGen/X86/vector-reduce-fmul.ll
+++ test/CodeGen/X86/vector-reduce-fmul.ll
@@ -38,7 +38,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
@@ -89,7 +89,7 @@
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
@@ -175,7 +175,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
@@ -326,7 +326,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
@@ -360,7 +360,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
@@ -407,7 +407,7 @@
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
@@ -489,7 +489,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
@@ -636,7 +636,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
@@ -668,7 +668,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}
@@ -715,7 +715,7 @@
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}
@@ -797,7 +797,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}
@@ -944,7 +944,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}
@@ -973,7 +973,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
@@ -1011,7 +1011,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
@@ -1070,7 +1070,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
@@ -1171,7 +1171,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
@@ -1199,7 +1199,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
@@ -1236,7 +1236,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
@@ -1294,7 +1294,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
@@ -1392,7 +1392,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
@@ -1418,7 +1418,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
@@ -1453,7 +1453,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
@@ -1509,7 +1509,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
@@ -1606,16 +1606,16 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double, <16 x double>)