diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15543,8 +15543,8 @@ that is used by the conditional branch controlling the loop. -Experimental Vector Reduction Intrinsics ----------------------------------------- +Vector Reduction Intrinsics +--------------------------- Horizontal reductions of vectors can be expressed using the following intrinsics. Each one takes a vector operand as an input and applies its @@ -15552,21 +15552,21 @@ scalar result of the same element type. -'``llvm.experimental.vector.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.add.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a) - declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) + declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD`` +The '``llvm.vector.reduce.add.*``' intrinsics do an integer ``ADD`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15574,34 +15574,34 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a) - declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a) + declare float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %a) + declare double @llvm.vector.reduce.fadd.v2f64(double %start_value, <2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fadd.*``' intrinsics do a floating-point ``ADD`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the 'reassoc' or 'fast' flags set, then the -reduction will not preserve the associativity of an equivalent scalarized -counterpart. Otherwise the reduction will be *ordered*, thus implying that -the operation respects the associativity of a scalarized reduction. That is, the -reduction begins with the start value and performs an fadd operation with consecutively -increasing vector element indices. See the following pseudocode: +If the intrinsic call has the 'reassoc' flag set, then the reduction will not +preserve the associativity of an equivalent scalarized counterpart. Otherwise +the reduction will be *sequential*, thus implying that the operation respects +the associativity of a scalarized reduction. That is, the reduction begins with +the start value and performs an fadd operation with consecutively increasing +vector element indices. 
See the following pseudocode: :: - float ordered_fadd(start_value, input_vector) + float sequential_fadd(start_value, input_vector) result = start_value for i = 0 to length(input_vector) result = result + input_vector[i] @@ -15619,25 +15619,25 @@ :: - %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction - %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction + %unord = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %input) ; relaxed reduction + %ord = call float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %input) ; sequential reduction -'``llvm.experimental.vector.reduce.mul.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.mul.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a) - declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a) + declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a) + declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL`` +The '``llvm.vector.reduce.mul.*``' intrinsics do an integer ``MUL`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15645,34 +15645,34 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.fmul.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a) - declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a) + declare float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %a) + declare double @llvm.vector.reduce.fmul.v2f64(double %start_value, <2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fmul.*``' intrinsics do a floating-point ``MUL`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the 'reassoc' or 'fast' flags set, then the -reduction will not preserve the associativity of an equivalent scalarized -counterpart. Otherwise the reduction will be *ordered*, thus implying that -the operation respects the associativity of a scalarized reduction. That is, the -reduction begins with the start value and performs an fmul operation with consecutively -increasing vector element indices. See the following pseudocode: +If the intrinsic call has the 'reassoc' flag set, then the reduction will not +preserve the associativity of an equivalent scalarized counterpart. Otherwise +the reduction will be *sequential*, thus implying that the operation respects +the associativity of a scalarized reduction. That is, the reduction begins with +the start value and performs an fmul operation with consecutively increasing +vector element indices. 
See the following pseudocode: :: - float ordered_fmul(start_value, input_vector) + float sequential_fmul(start_value, input_vector) result = start_value for i = 0 to length(input_vector) result = result * input_vector[i] @@ -15690,23 +15690,23 @@ :: - %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction - %ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction + %unord = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.0, <4 x float> %input) ; relaxed reduction + %ord = call float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %input) ; sequential reduction -'``llvm.experimental.vector.reduce.and.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.and.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND`` +The '``llvm.vector.reduce.and.*``' intrinsics do a bitwise ``AND`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15714,20 +15714,20 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.or.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.or.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction +The '``llvm.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15735,20 +15735,20 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.xor.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.xor.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR`` +The '``llvm.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15756,20 +15756,20 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.smax.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.smax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer +The '``llvm.vector.reduce.smax.*``' intrinsics do a signed integer ``MAX`` reduction of a vector, returning the result as a scalar. 
The return type matches the element-type of the vector input. @@ -15777,20 +15777,20 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.smin.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.smin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer +The '``llvm.vector.reduce.smin.*``' intrinsics do a signed integer ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15798,20 +15798,20 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.umax.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.umax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned +The '``llvm.vector.reduce.umax.*``' intrinsics do an unsigned integer ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15819,20 +15819,20 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.umin.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.umin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned +The '``llvm.vector.reduce.umin.*``' intrinsics do an unsigned integer ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15840,21 +15840,21 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.fmax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" :: - declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) - declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + declare float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) + declare double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fmax.*``' intrinsics do a floating-point ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15870,8 +15870,8 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. 
-'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.vector.reduce.fmin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" @@ -15879,13 +15879,13 @@ :: - declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) - declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + declare float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) + declare double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fmin.*``' intrinsics do a floating-point ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -63,6 +63,10 @@ * Added the ``byref`` attribute to better represent argument passing for the `amdgpu_kernel` calling convention. +* The ``llvm.experimental.vector.reduce`` family of intrinsics have been renamed + to drop the "experimental" from the name, reflecting their now fully supported + status in the IR. + Changes to building LLVM ------------------------ diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1180,19 +1180,19 @@ return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], VarMask, Alignment, CostKind, I); } - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: { IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I); return getIntrinsicInstrCost(Attrs, CostKind); } @@ -1407,46 +1407,46 @@ return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, CostKind); } - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: return 
thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, /*IsUnsigned=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1452,34 +1452,35 @@ //===------------------------ Reduction Intrinsics ------------------------===// // let IntrProperties = [IntrNoMem, IntrWillReturn] in { - def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty]>; - def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty]>; - def int_experimental_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; + + def 
int_vector_reduce_fadd : Intrinsic<[LLVMVectorElementType<0>], + [LLVMVectorElementType<0>, + llvm_anyvector_ty]>; + def int_vector_reduce_fmul : Intrinsic<[LLVMVectorElementType<0>], + [LLVMVectorElementType<0>, + llvm_anyvector_ty]>; + def int_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; } //===----- Matrix intrinsics ---------------------------------------------===// diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1457,15 +1457,15 @@ case Intrinsic::smul_fix_sat: case Intrinsic::bitreverse: case Intrinsic::is_constant: - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: // Target intrinsics case Intrinsic::arm_mve_vctp8: case Intrinsic::arm_mve_vctp16: @@ -1711,31 +1711,31 @@ return nullptr; const APInt &X = CI->getValue(); switch (IID) { - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: Acc = Acc + X; break; - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: Acc = Acc * X; break; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: Acc = Acc & X; break; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: Acc = Acc | X; break; - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: Acc = Acc ^ X; break; - case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::vector_reduce_smin: Acc = APIntOps::smin(Acc, X); break; - case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::vector_reduce_smax: Acc = APIntOps::smax(Acc, X); break; - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umin: Acc = APIntOps::umin(Acc, X); break; - case 
Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_umax: Acc = APIntOps::umax(Acc, X); break; } @@ -2240,15 +2240,15 @@ if (isa(Operands[0])) { switch (IntrinsicID) { default: break; - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: return ConstantInt::get(Ty, 0); } } @@ -2259,15 +2259,15 @@ auto *Op = cast(Operands[0]); switch (IntrinsicID) { default: break; - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: if (Constant *C = ConstantFoldVectorReduce(IntrinsicID, Op)) return C; break; diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This pass implements IR expansion for reduction intrinsics, allowing targets -// to enable the experimental intrinsics until just before codegen. +// to enable the intrinsics until just before codegen. 
// //===----------------------------------------------------------------------===// @@ -30,27 +30,27 @@ unsigned getOpcode(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: return Instruction::FAdd; - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: return Instruction::FMul; - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: return Instruction::Add; - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: return Instruction::Mul; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: return Instruction::And; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: return Instruction::Or; - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: return Instruction::Xor; - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: return Instruction::ICmp; - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: return Instruction::FCmp; default: llvm_unreachable("Unexpected ID"); @@ -59,17 +59,17 @@ RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::vector_reduce_smax: return RecurrenceDescriptor::MRK_SIntMax; - case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::vector_reduce_smin: return RecurrenceDescriptor::MRK_SIntMin; - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_umax: return RecurrenceDescriptor::MRK_UIntMax; - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umin: return RecurrenceDescriptor::MRK_UIntMin; - case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::vector_reduce_fmax: return RecurrenceDescriptor::MRK_FloatMax; - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fmin: return RecurrenceDescriptor::MRK_FloatMin; default: return RecurrenceDescriptor::MRK_Invalid; @@ -83,19 +83,19 @@ if (auto *II = dyn_cast(&I)) { switch (II->getIntrinsicID()) { default: break; - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case 
Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: if (TTI->shouldExpandReduction(II)) Worklist.push_back(II); @@ -116,8 +116,8 @@ Builder.setFastMathFlags(FMF); switch (ID) { default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: { + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: { // FMFs must be attached to the call, otherwise it's an ordered reduction // and it can't be handled by generating a shuffle sequence. Value *Acc = II->getArgOperand(0); @@ -135,15 +135,15 @@ } break; } - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: { Value *Vec = II->getArgOperand(0); if (!isPowerOf2_32( cast(Vec->getType())->getNumElements())) @@ -152,8 +152,8 @@ Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); break; } - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: { + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: { // FIXME: We only expand 'fast' reductions here because the underlying // code in createMinMaxOp() assumes that comparisons use 'fast' // semantics. 
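[Reference note, not part of the patch: a minimal sketch of what the rename means at the IR level, assuming an existing <4 x i32> value %v. Calls spelled with the old experimental names in older bitcode are rewritten by the auto-upgrader to the new names, and targets that opt in via shouldExpandReduction() have ExpandReductions lower the call to a log2 shufflevector sequence; value names and shuffle masks below are illustrative, not the exact output.]

::

  ; Old name in existing bitcode (auto-upgraded on load)
  %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v)
  ; becomes
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)

  ; Rough shape of the shuffle expansion ExpandReductions emits for the call above
  %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %a0 = add <4 x i32> %v, %s0
  %s1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %a1 = add <4 x i32> %a0, %s1
  %r2 = extractelement <4 x i32> %a1, i32 0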
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6762,19 +6762,19 @@ LowerDeoptimizeCall(&I); return; - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: visitVectorReduce(I, Intrinsic); return; @@ -8937,7 +8937,7 @@ SDFlags.copyFMF(*FPMO); switch (Intrinsic) { - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: if (SDFlags.hasAllowReassociation()) Res = DAG.getNode(ISD::FADD, dl, VT, Op1, DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2, SDFlags), @@ -8945,7 +8945,7 @@ else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2, SDFlags); break; - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: if (SDFlags.hasAllowReassociation()) Res = DAG.getNode(ISD::FMUL, dl, VT, Op1, DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2, SDFlags), @@ -8953,37 +8953,37 @@ else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2, SDFlags); break; - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::vector_reduce_smax: Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::vector_reduce_smin: Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_umax: Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umin: Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); break; 
- case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::vector_reduce_fmax: Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); break; - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fmin: Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); break; default: diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsX86.h" @@ -717,18 +718,42 @@ } case 'e': { SmallVector Groups; - static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+"); + static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[a-z][0-9]+"); if (R.match(Name, &Groups)) { + Intrinsic::ID ID; + ID = StringSwitch(Groups[1]) + .Case("add", Intrinsic::vector_reduce_add) + .Case("mul", Intrinsic::vector_reduce_mul) + .Case("and", Intrinsic::vector_reduce_and) + .Case("or", Intrinsic::vector_reduce_or) + .Case("xor", Intrinsic::vector_reduce_xor) + .Case("smax", Intrinsic::vector_reduce_smax) + .Case("smin", Intrinsic::vector_reduce_smin) + .Case("umax", Intrinsic::vector_reduce_umax) + .Case("umin", Intrinsic::vector_reduce_umin) + .Case("fmax", Intrinsic::vector_reduce_fmax) + .Case("fmin", Intrinsic::vector_reduce_fmin) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) { + rename(F); + auto Args = F->getFunctionType()->params(); + NewFn = Intrinsic::getDeclaration(F->getParent(), ID, {Args[0]}); + return true; + } + } + static const Regex R2( + "^experimental.vector.reduce.v2.([a-z]+)\\.[fi][0-9]+"); + Groups.clear(); + if (R2.match(Name, &Groups)) { Intrinsic::ID ID = Intrinsic::not_intrinsic; if (Groups[1] == "fadd") - ID = Intrinsic::experimental_vector_reduce_v2_fadd; + ID = Intrinsic::vector_reduce_fadd; if (Groups[1] == "fmul") - ID = Intrinsic::experimental_vector_reduce_v2_fmul; - + ID = Intrinsic::vector_reduce_fmul; if (ID != Intrinsic::not_intrinsic) { rename(F); auto Args = F->getFunctionType()->params(); - Type *Tys[] = {F->getFunctionType()->getReturnType(), Args[1]}; + Type *Tys[] = {Args[1]}; NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); return true; } @@ -3620,28 +3645,6 @@ DefaultCase(); return; } - case Intrinsic::experimental_vector_reduce_v2_fmul: { - SmallVector Args; - if (CI->isFast()) - Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0)); - else - Args.push_back(CI->getOperand(0)); - Args.push_back(CI->getOperand(1)); - NewCall = Builder.CreateCall(NewFn, Args); - cast(NewCall)->copyFastMathFlags(CI); - break; - } - case Intrinsic::experimental_vector_reduce_v2_fadd: { - SmallVector Args; - if (CI->isFast()) - Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType())); - else - Args.push_back(CI->getOperand(0)); - Args.push_back(CI->getOperand(1)); - NewCall = Builder.CreateCall(NewFn, Args); - cast(NewCall)->copyFastMathFlags(CI); - break; - } case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -325,61 +325,53 @@ CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = 
{Acc, Src}; - Type *Tys[] = {Acc->getType(), Src->getType()}; - auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys); + auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fadd, + {Src->getType()}); return createCallHelper(Decl, Ops, this); } CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - Type *Tys[] = {Acc->getType(), Src->getType()}; - auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys); + auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fmul, + {Src->getType()}); return createCallHelper(Decl, Ops, this); } CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_add, Src); } CallInst *IRBuilderBase::CreateMulReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_mul, Src); } CallInst *IRBuilderBase::CreateAndReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_and, Src); } CallInst *IRBuilderBase::CreateOrReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_or, Src); } CallInst *IRBuilderBase::CreateXorReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_xor, Src); } CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) { - auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax - : Intrinsic::experimental_vector_reduce_umax; + auto ID = + IsSigned ? Intrinsic::vector_reduce_smax : Intrinsic::vector_reduce_umax; return getReductionIntrinsic(this, ID, Src); } CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) { - auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin - : Intrinsic::experimental_vector_reduce_umin; + auto ID = + IsSigned ? 
Intrinsic::vector_reduce_smin : Intrinsic::vector_reduce_umin; return getReductionIntrinsic(this, ID, Src); } CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) { - auto Rdx = getReductionIntrinsic( - this, Intrinsic::experimental_vector_reduce_fmax, Src); + auto Rdx = getReductionIntrinsic(this, Intrinsic::vector_reduce_fmax, Src); if (NoNaN) { FastMathFlags FMF; FMF.setNoNaNs(); @@ -389,8 +381,7 @@ } CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) { - auto Rdx = getReductionIntrinsic( - this, Intrinsic::experimental_vector_reduce_fmin, Src); + auto Rdx = getReductionIntrinsic(this, Intrinsic::vector_reduce_fmin, Src); if (NoNaN) { FastMathFlags FMF; FMF.setNoNaNs(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -219,8 +219,8 @@ bool shouldExpandReduction(const IntrinsicInst *II) const { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: // We don't have legalization support for ordered FP reductions. return !II->getFastMathFlags().allowReassoc(); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -195,8 +195,8 @@ bool shouldExpandReduction(const IntrinsicInst *II) const { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: // We don't have legalization support for ordered FP reductions. 
return !II->getFastMathFlags().allowReassoc(); default: diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -270,7 +270,7 @@ case Intrinsic::uadd_sat: case Intrinsic::ssub_sat: case Intrinsic::usub_sat: - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: continue; case Intrinsic::fma: case Intrinsic::trunc: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1824,8 +1824,7 @@ } auto m_AddRdx = [](Value *&Vec) { - return m_OneUse( - m_Intrinsic(m_Value(Vec))); + return m_OneUse(m_Intrinsic(m_Value(Vec))); }; Value *V0, *V1; if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) && @@ -1833,8 +1832,8 @@ // Difference of sums is sum of differences: // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1) Value *Sub = Builder.CreateSub(V0, V1); - Value *Rdx = Builder.CreateIntrinsic( - Intrinsic::experimental_vector_reduce_add, {Sub->getType()}, {Sub}); + Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add, + {Sub->getType()}, {Sub}); return replaceInstUsesWith(I, Rdx); } @@ -2280,9 +2279,8 @@ } auto m_FaddRdx = [](Value *&Sum, Value *&Vec) { - return m_OneUse( - m_Intrinsic( - m_Value(Sum), m_Value(Vec))); + return m_OneUse(m_Intrinsic(m_Value(Sum), + m_Value(Vec))); }; Value *A0, *A1, *V0, *V1; if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) && @@ -2290,9 +2288,8 @@ // Difference of sums is sum of differences: // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1 Value *Sub = Builder.CreateFSubFMF(V0, V1, &I); - Value *Rdx = Builder.CreateIntrinsic( - Intrinsic::experimental_vector_reduce_v2_fadd, - {A0->getType(), Sub->getType()}, {A0, Sub}, &I); + Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd, + {Sub->getType()}, {A0, Sub}, &I); return BinaryOperator::CreateFSubFMF(Rdx, A1, &I); } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2995,7 +2995,7 @@ setOrigin(&I, getOrigin(&I, 0)); } - // Instrument experimental.vector.reduce.or intrinsic. + // Instrument vector.reduce.or intrinsic. // Valid (non-poisoned) set bits in the operand pull low the // corresponding shadow bits. void handleVectorReduceOrIntrinsic(IntrinsicInst &I) { @@ -3013,7 +3013,7 @@ setOrigin(&I, getOrigin(&I, 0)); } - // Instrument experimental.vector.reduce.or intrinsic. + // Instrument vector.reduce.and intrinsic. // Valid (non-poisoned) unset bits in the operand pull down the // corresponding shadow bits. 
void handleVectorReduceAndIntrinsic(IntrinsicInst &I) { @@ -3264,15 +3264,15 @@ case Intrinsic::masked_load: handleMaskedLoad(I); break; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: handleVectorReduceAndIntrinsic(I); break; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: handleVectorReduceOrIntrinsic(I); break; - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_mul: handleVectorReduceIntrinsic(I); break; case Intrinsic::x86_sse_stmxcsr: diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll --- a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll @@ -2,278 +2,278 @@ ; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE ; COST-LABEL: add.i8.v8i8 -; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) ; CODE-LABEL: add.i8.v8i8 ; CODE: addv b0, v0.8b define i8 @add.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: add.i8.v16i8 -; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) ; CODE-LABEL: add.i8.v16i8 ; CODE: addv b0, v0.16b define i8 @add.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: add.i16.v4i16 -; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) ; CODE-LABEL: add.i16.v4i16 ; CODE: addv h0, v0.4h define i16 @add.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: add.i16.v8i16 -; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) ; CODE-LABEL: add.i16.v8i16 ; CODE: addv h0, v0.8h define i16 @add.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: add.i32.v4i32 -; COST: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) ; CODE-LABEL: add.i32.v4i32 ; CODE: addv s0, v0.4s define i32 @add.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) 
ret i32 %r } ; COST-LABEL: umin.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) ; CODE-LABEL: umin.i8.v8i8 ; CODE: uminv b0, v0.8b define i8 @umin.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: umin.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v) ; CODE-LABEL: umin.i8.v16i8 ; CODE: uminv b0, v0.16b define i8 @umin.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: umin.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) ; CODE-LABEL: umin.i16.v4i16 ; CODE: uminv h0, v0.4h define i16 @umin.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: umin.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v) ; CODE-LABEL: umin.i16.v8i16 ; CODE: uminv h0, v0.8h define i16 @umin.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: umin.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) ; CODE-LABEL: umin.i32.v4i32 ; CODE: uminv s0, v0.4s define i32 @umin.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: umax.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) ; CODE-LABEL: umax.i8.v8i8 ; CODE: umaxv b0, v0.8b define i8 @umax.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: umax.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v) ; CODE-LABEL: umax.i8.v16i8 ; CODE: umaxv b0, v0.16b define i8 @umax.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 
x i8> %v) ret i8 %r } ; COST-LABEL: umax.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) ; CODE-LABEL: umax.i16.v4i16 ; CODE: umaxv h0, v0.4h define i16 @umax.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: umax.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v) ; CODE-LABEL: umax.i16.v8i16 ; CODE: umaxv h0, v0.8h define i16 @umax.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: umax.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) ; CODE-LABEL: umax.i32.v4i32 ; CODE: umaxv s0, v0.4s define i32 @umax.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: smin.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) ; CODE-LABEL: smin.i8.v8i8 ; CODE: sminv b0, v0.8b define i8 @smin.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: smin.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v) ; CODE-LABEL: smin.i8.v16i8 ; CODE: sminv b0, v0.16b define i8 @smin.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: smin.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) ; CODE-LABEL: smin.i16.v4i16 ; CODE: sminv h0, v0.4h define i16 @smin.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: smin.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v) ; CODE-LABEL: smin.i16.v8i16 ; CODE: sminv h0, v0.8h define i16 @smin.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %v) + %r = call i16 
@llvm.vector.reduce.smin.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: smin.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) ; CODE-LABEL: smin.i32.v4i32 ; CODE: sminv s0, v0.4s define i32 @smin.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: smax.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) ; CODE-LABEL: smax.i8.v8i8 ; CODE: smaxv b0, v0.8b define i8 @smax.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: smax.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v) ; CODE-LABEL: smax.i8.v16i8 ; CODE: smaxv b0, v0.16b define i8 @smax.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: smax.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) ; CODE-LABEL: smax.i16.v4i16 ; CODE: smaxv h0, v0.4h define i16 @smax.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: smax.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v) ; CODE-LABEL: smax.i16.v8i16 ; CODE: smaxv h0, v0.8h define i16 @smax.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: smax.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) ; CODE-LABEL: smax.i32.v4i32 ; CODE: smaxv s0, v0.4s define i32 @smax.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: fmin.f32.v4f32 -; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) ; CODE-LABEL: fmin.f32.v4f32 ; CODE: fminnmv s0, v0.4s define float @fmin.f32.v4f32(<4 x float> %v) { - %r = call nnan float 
@llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %v) + %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) ret float %r } ; COST-LABEL: fmax.f32.v4f32 -; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) ; CODE-LABEL: fmax.f32.v4f32 ; CODE: fmaxnmv s0, v0.4s define float @fmax.f32.v4f32(<4 x float> %v) { - %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %v) + %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) ret float %r } -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) - -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) - -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 
@llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) + +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) + +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll @@ -8,155 +8,155 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-RECIP-LABEL: 'reduce_i64' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'reduce_i64' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'reduce_i64' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'reduce_i64' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'reduce_i64' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'reduce_i64' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) - %V16 = call i64 
@llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-RECIP-LABEL: 'reduce_i32' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'reduce_i32' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) 
-; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'reduce_i32' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'reduce_i32' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 
@llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'reduce_i32' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'reduce_i32' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) + %V32 = call i8 
@llvm.vector.reduce.add.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll @@ -5,171 +5,171 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 
@llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 
@llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 
@llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for 
instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found 
an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x 
i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll @@ -5,171 +5,171 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated 
cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 
@llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 
for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; 
V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: 
Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 
@llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; 
MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 
@llvm.vector.reduce.smin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll @@ -5,171 +5,171 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 
@llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) 
+ %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 
@llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost 
Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost 
Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: 
Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an 
estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare 
i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll @@ -5,171 +5,171 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call 
i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 
@llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 
@llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for 
instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) - %V16 = 
call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost 
Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) - %V4 = call i8 
@llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll @@ -12,279 +12,279 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated 
cost of 17 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 
@llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
ret i32 undef ; ; SLM-LABEL: 'reduce_i16' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = 
call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 
@llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for 
instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i8' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) - %V32 = call i8 
@llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll 
b/llvm/test/Analysis/CostModel/X86/reduce-and.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -10,258 +10,258 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x 
i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; 
AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 
x i32> undef) + %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; 
SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 
@llvm.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) ret i32 undef } define i32 @reduce_i1(i32 %arg) { ; SSE-LABEL: 'reduce_i1' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 
@llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> 
undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) - %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) - %V4 = call i1 
@llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) - %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) - %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) - %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) - %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) - %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) + %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>) -declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1>) +declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1>) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.and.v128i1(<128 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll @@ -11,83 +11,83 @@ define i32 @reduce_f64(i32 %arg) { ; SSE-LABEL: 'reduce_f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: 
%V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) - %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) - %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) - %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) - %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) + %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) + %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) + %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ret i32 undef } define i32 @reduce_f32(i32 %arg) { ; SSE-LABEL: 'reduce_f32' -; SSE-NEXT: Cost Model: Found an estimated cost 
of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f32' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> 
undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) - %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) - %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) - %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) - %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) - %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) + %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) + %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) + %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) + %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) ret i32 undef } -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare 
double @llvm.vector.reduce.fmax.v16f64(<16 x double>) -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll @@ -11,83 +11,83 @@ define i32 @reduce_f64(i32 %arg) { ; SSE-LABEL: 'reduce_f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) - %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) - %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) - %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) - %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) + %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) + %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) + %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ret i32 undef } define i32 @reduce_f32(i32 %arg) { ; SSE-LABEL: 'reduce_f32' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f32' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX512-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) - %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) - %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) - %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) - %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) - %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) + %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) + %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) + %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) + %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ret i32 undef } -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) -declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float 
@llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmin.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -10,276 +10,276 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i64' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i64' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call 
i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i64' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) + 
%V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 
for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 
@llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 
@llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; 
AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost 
Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 
@llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for 
instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) + %V2 = call i8 
@llvm.vector.reduce.mul.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll @@ -10,258 +10,258 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 
x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated 
cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 
@llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 
@llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ret i32 undef } define i32 @reduce_i1(i32 %arg) { ; SSE-LABEL: 'reduce_i1' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 
@llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 
@llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for 
instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 
@llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) - %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) - %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) - %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) - %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) - %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) - %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) - %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) + %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) + %V4 = call 
i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) -declare i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1>) -declare i1 
@llvm.experimental.vector.reduce.or.v64i1(<64 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1>) +declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1>) +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.or.v128i1(<128 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSSE3-NEXT: 
Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 
@llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for 
instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 
@llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 
@llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 
@llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> 
undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) -declare i32 
@llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 
@llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 
@llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: 
%V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 
@llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = 
call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> 
undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 
7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 
= call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 
0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) 
+declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call 
i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 
@llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 
@llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for 
instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 
@llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found 
an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> 
undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 
for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = 
call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x 
i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call 
i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x 
i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 
@llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call 
i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> 
undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for 
instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = 
call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) - %V8 = call i16 
@llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 
= call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = 
call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) - %V4 = call i8 
@llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -10,280 +10,280 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x 
i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) - %V16 = 
call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: 
Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x 
i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ret i32 undef } define i32 @reduce_i1(i32 %arg) { ; SSE2-LABEL: 'reduce_i1' -; SSE2-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i1' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i1' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = 
call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 
@llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) - %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) - %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) - %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) - %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) - %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) - %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) - %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) + %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) 
+declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
-declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
-declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
-declare i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1>)
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1>)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.xor.v128i1(<128 x i1>)
diff --git a/llvm/test/Assembler/invalid-vecreduce.ll b/llvm/test/Assembler/invalid-vecreduce.ll
--- a/llvm/test/Assembler/invalid-vecreduce.ll
+++ b/llvm/test/Assembler/invalid-vecreduce.ll
@@ -1,34 +1,34 @@
 ; RUN: not opt -S < %s 2>&1 | FileCheck %s
-; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64
+; CHECK: Intrinsic has incorrect return type!
+; CHECK-NEXT: float (double, <2 x double>)* @llvm.vector.reduce.fadd.f32.f64.v2f64
 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
-  %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+  %res = call float @llvm.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
   ret float %res
 }

 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64
+; CHECK-NEXT: double (float, <2 x double>)* @llvm.vector.reduce.fadd.f64.f32.v2f64
 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) {
-  %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+  %res = call double @llvm.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
   ret double %res
 }

-; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64
+; CHECK: Intrinsic has incorrect return type!
+; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.vector.reduce.fadd.v2f64.f64.v2f64
 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) {
-  %res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
+  %res = call <2 x double> @llvm.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
   ret <2 x double> %res
 }

 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64
+; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.vector.reduce.fadd.f64.v2f64.v2f64
 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) {
-  %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+  %res = call double @llvm.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
   ret double %res
 }

-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
-declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
+declare float @llvm.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+declare double @llvm.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+declare double @llvm.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+declare <2 x double> @llvm.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
diff --git a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
--- a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
+++ b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
@@ -1,64 +1,130 @@
 ; RUN: opt -S < %s | FileCheck %s
 ; RUN: llvm-dis < %s.bc | FileCheck %s

-define float @fadd_acc(<4 x float> %in, float %acc) {
-; CHECK-LABEL: @fadd_acc
-; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)
-  %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)
+
+define float @fadd_v2(<4 x float> %in, float %acc) {
+; CHECK-LABEL: @fadd_v2
+; CHECK: %res = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %in)
+  %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)
   ret float %res
 }

-define float @fadd_undef(<4 x float> %in) {
-; CHECK-LABEL: @fadd_undef
-; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in)
-  %res = call float
@llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) +define float @fadd_v2_fast(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_v2_fast +; CHECK: %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fadd_fast_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fadd_fast_acc -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) +define float @fmul_v2(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_v2 +; CHECK: %res = call float @llvm.vector.reduce.fmul.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fadd_fast_undef(<4 x float> %in) { -; CHECK-LABEL: @fadd_fast_undef -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) +define float @fmul_v2_fast(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_v2_fast +; CHECK: %res = call fast float @llvm.vector.reduce.fmul.v4f32(float %acc, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fmul_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fmul_acc -; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) - %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) +define float @fmin(<4 x float> %in) { +; CHECK-LABEL: @fmin +; CHECK: %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %in) ret float %res } -define float @fmul_undef(<4 x float> %in) { -; CHECK-LABEL: @fmul_undef -; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in) - %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) +define float @fmax(<4 x float> %in) { +; CHECK-LABEL: @fmax +; CHECK: %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %in) ret float %res } -define float @fmul_fast_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fmul_fast_acc -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) - ret float %res +define i32 @and(<4 x i32> %in) { +; CHECK-LABEL: @and +; CHECK: %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %in) + ret i32 %res } -define float @fmul_fast_undef(<4 x float> %in) { -; CHECK-LABEL: @fmul_fast_undef -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) - ret float %res +define i32 @or(<4 x i32> %in) { +; CHECK-LABEL: @or 
+; CHECK: %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @xor(<4 x i32> %in) { +; CHECK-LABEL: @xor +; CHECK: %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @smin(<4 x i32> %in) { +; CHECK-LABEL: @smin +; CHECK: %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %in) + ret i32 %res } -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +define i32 @smax(<4 x i32> %in) { +; CHECK-LABEL: @smax +; CHECK: %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @umin(<4 x i32> %in) { +; CHECK-LABEL: @umin +; CHECK: %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @umax(<4 x i32> %in) { +; CHECK-LABEL: @umax +; CHECK: %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %in) + ret i32 %res +} + + +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) + +declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) +; CHECK: declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) + +declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +; CHECK: declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) + +declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) + + + + -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) -; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) diff --git a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) define i8 @add_B(<16 x i8>* %arr) { ; CHECK-LABEL: add_B ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b %bin.rdx = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %bin.rdx) + %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %bin.rdx) ret i8 %r } @@ -18,7 +18,7 @@ ; CHECK-LABEL: add_H ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h %bin.rdx = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %bin.rdx) + %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx) ret i16 %r } @@ -26,7 +26,7 @@ ; CHECK-LABEL: add_S ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %bin.rdx) + %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx) ret i32 %r } @@ -35,11 +35,11 @@ ; CHECK-NOT: addv ; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d %bin.rdx = load <2 x i64>, <2 x i64>* %arr - %r = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %bin.rdx) + %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %bin.rdx) ret i64 %r } -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) { ; CHECK-LABEL: oversized_ADDV_256 @@ -55,16 +55,16 @@ %7 = icmp slt <8 x i32> %6, zeroinitializer %8 = sub nsw <8 x i32> zeroinitializer, %6 %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 - %r = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %9) + %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %9) ret i32 %r } -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(<16 x i32>* %arr) { ; CHECK-LABEL: oversized_ADDV_512 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) + %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -2,28 +2,28 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) - -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x 
float>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) + +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) ; CHECK-LABEL: smax_B ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -31,7 +31,7 @@ ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -39,7 +39,7 @@ ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -47,7 +47,7 @@ ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -55,7 +55,7 @@ ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -63,7 +63,7 @@ ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -71,7 +71,7 @@ ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -79,7 +79,7 @@ ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -87,7 +87,7 @@ ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 
@llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -95,7 +95,7 @@ ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -103,7 +103,7 @@ ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -111,7 +111,7 @@ ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -119,7 +119,7 @@ ; CHECK: fmaxnmv define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) { %arr.load = load <4 x float>, <4 x float>* %arr - %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %arr.load) + %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arr.load) ret float %r } @@ -127,22 +127,22 @@ ; CHECK: fminnmv define float @fminnm_S(<4 x float>* nocapture readonly %arr) { %arr.load = load <4 x float>, <4 x float>* %arr - %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %arr.load) + %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arr.load) ret float %r } -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_256 ; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: umaxv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_512 @@ -151,22 +151,22 @@ ; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %arr.load) ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_256 ; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: uminv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 
@llvm.vector.reduce.umin.v16i32(<16 x i32>) define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_512 @@ -175,22 +175,22 @@ ; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %arr.load) ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_256 ; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: smaxv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_512 @@ -199,22 +199,22 @@ ; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %arr.load) ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smin_256 ; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: sminv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smin_512 @@ -223,6 +223,6 @@ ; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %arr.load) ret i32 %r } diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -141,7 +141,7 @@ ret <2 x i64> %tmp4 } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) { ; CHECK-LABEL: uabdl8h_rdx @@ -155,11 +155,11 @@ %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff - %reduced_v = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %absel) + %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel) ret i16 %reduced_v } -declare i32 
@llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) { ; CHECK-LABEL: uabdl4s_rdx @@ -173,11 +173,11 @@ %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff - %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %absel) + %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel) ret i32 %reduced_v } -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { ; CHECK: uabdl2d_rdx @@ -191,7 +191,7 @@ %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff - %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %absel) + %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel) ret i64 %reduced_v } diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -205,7 +205,7 @@ ret void } -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @test_udot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) { entry: @@ -218,7 +218,7 @@ %4 = load <8 x i8>, <8 x i8>* %3 %5 = zext <8 x i8> %4 to <8 x i32> %6 = mul nuw nsw <8 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) ret i32 %7 } @@ -233,11 +233,11 @@ %4 = load <8 x i8>, <8 x i8>* %3 %5 = sext <8 x i8> %4 to <8 x i32> %6 = mul nsw <8 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) ret i32 %7 } -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @test_udot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) { entry: @@ -250,7 +250,7 @@ %4 = load <16 x i8>, <16 x i8>* %3 %5 = zext <16 x i8> %4 to <16 x i32> %6 = mul nuw nsw <16 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6) %op.extra = add i32 %7, %sum ret i32 %op.extra } @@ -265,7 +265,7 @@ %0 = bitcast i8* %a1 to <16 x i8>* %1 = load <16 x i8>, <16 x i8>* %0 %2 = zext <16 x i8> %1 to <16 x i32> - %3 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %2) + %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) ret i32 %3 } @@ -280,7 +280,7 @@ %4 = load <16 x i8>, <16 x i8>* %3 %5 = sext <16 x i8> %4 to <16 x i32> %6 = mul nsw <16 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6) %op.extra = add nsw i32 %7, %sum ret i32 %op.extra } @@ -295,6 +295,6 @@ %0 = bitcast i8* %a1 to <16 x i8>* %1 = load <16 x i8>, <16 x i8>* %0 %2 = sext <16 x i8> %1 to <16 x i32> - %3 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %2) + %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) ret i32 %3 } diff --git 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll @@ -29,7 +29,7 @@ ; CHECK-LABEL: fmaxv_v4f16: ; CHECK: fmaxnmv h0, v0.4h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a) + %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res } @@ -38,7 +38,7 @@ ; CHECK-LABEL: fmaxv_v8f16: ; CHECK: fmaxnmv h0, v0.8h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %a) + %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res } @@ -49,7 +49,7 @@ ; VBITS_GE_256-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_256-NEXT: ret %op = load <16 x half>, <16 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) ret half %res } @@ -60,7 +60,7 @@ ; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_512-NEXT: ret %op = load <32 x half>, <32 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v32f16(<32 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op) ret half %res } @@ -71,7 +71,7 @@ ; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_1024-NEXT: ret %op = load <64 x half>, <64 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v64f16(<64 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op) ret half %res } @@ -82,7 +82,7 @@ ; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_2048-NEXT: ret %op = load <128 x half>, <128 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v128f16(<128 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op) ret half %res } @@ -91,7 +91,7 @@ ; CHECK-LABEL: fmaxv_v2f32: ; CHECK: fmaxnmp s0, v0.2s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a) + %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res } @@ -100,7 +100,7 @@ ; CHECK-LABEL: fmaxv_v4f32: ; CHECK: fmaxnmv s0, v0.4s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) + %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res } @@ -111,7 +111,7 @@ ; VBITS_GE_256-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_256-NEXT: ret %op = load <8 x float>, <8 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) ret float %res } @@ -122,7 +122,7 @@ ; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_512-NEXT: ret %op = load <16 x float>, <16 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op) ret float %res } @@ -133,7 +133,7 @@ ; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_1024-NEXT: ret %op = load <32 x float>, <32 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op) ret float %res } @@ -144,7 +144,7 @@ ; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_2048-NEXT: ret %op = load <64 x float>, <64 x float>* %a - %res = call float 
@llvm.experimental.vector.reduce.fmax.v64f32(<64 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op) ret float %res } @@ -153,7 +153,7 @@ ; CHECK-LABEL: fmaxv_v1f64: ; CHECK-NOT: fmax ; CHECK: ret - %res = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) + %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %res } @@ -162,7 +162,7 @@ ; CHECK-LABEL: fmaxv_v2f64: ; CHECK: fmaxnmp d0, v0.2d ; CHECK-NEXT: ret - %res = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res } @@ -173,7 +173,7 @@ ; VBITS_GE_256-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_256-NEXT: ret %op = load <4 x double>, <4 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) ret double %res } @@ -184,7 +184,7 @@ ; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_512-NEXT: ret %op = load <8 x double>, <8 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op) ret double %res } @@ -195,7 +195,7 @@ ; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_1024-NEXT: ret %op = load <16 x double>, <16 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op) ret double %res } @@ -206,7 +206,7 @@ ; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_2048-NEXT: ret %op = load <32 x double>, <32 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v32f64(<32 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op) ret double %res } @@ -219,7 +219,7 @@ ; CHECK-LABEL: fminv_v4f16: ; CHECK: fminnmv h0, v0.4h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a) + %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res } @@ -228,7 +228,7 @@ ; CHECK-LABEL: fminv_v8f16: ; CHECK: fminnmv h0, v0.8h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %a) + %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res } @@ -239,7 +239,7 @@ ; VBITS_GE_256-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_256-NEXT: ret %op = load <16 x half>, <16 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) ret half %res } @@ -250,7 +250,7 @@ ; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_512-NEXT: ret %op = load <32 x half>, <32 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v32f16(<32 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op) ret half %res } @@ -261,7 +261,7 @@ ; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_1024-NEXT: ret %op = load <64 x half>, <64 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v64f16(<64 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op) ret half %res } @@ -272,7 +272,7 @@ ; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_2048-NEXT: ret %op = load <128 x half>, <128 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v128f16(<128 x half> %op) + %res = call half 
@llvm.vector.reduce.fmin.v128f16(<128 x half> %op) ret half %res } @@ -281,7 +281,7 @@ ; CHECK-LABEL: fminv_v2f32: ; CHECK: fminnmp s0, v0.2s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a) + %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res } @@ -290,7 +290,7 @@ ; CHECK-LABEL: fminv_v4f32: ; CHECK: fminnmv s0, v0.4s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) + %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res } @@ -301,7 +301,7 @@ ; VBITS_GE_256-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_256-NEXT: ret %op = load <8 x float>, <8 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) ret float %res } @@ -312,7 +312,7 @@ ; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_512-NEXT: ret %op = load <16 x float>, <16 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op) ret float %res } @@ -323,7 +323,7 @@ ; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_1024-NEXT: ret %op = load <32 x float>, <32 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op) ret float %res } @@ -334,7 +334,7 @@ ; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_2048-NEXT: ret %op = load <64 x float>, <64 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v64f32(<64 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op) ret float %res } @@ -343,7 +343,7 @@ ; CHECK-LABEL: fminv_v1f64: ; CHECK-NOT: fmin ; CHECK: ret - %res = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) + %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) ret double %res } @@ -352,7 +352,7 @@ ; CHECK-LABEL: fminv_v2f64: ; CHECK: fminnmp d0, v0.2d ; CHECK-NEXT: ret - %res = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res } @@ -363,7 +363,7 @@ ; VBITS_GE_256-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_256-NEXT: ret %op = load <4 x double>, <4 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) ret double %res } @@ -374,7 +374,7 @@ ; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_512-NEXT: ret %op = load <8 x double>, <8 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op) ret double %res } @@ -385,7 +385,7 @@ ; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_1024-NEXT: ret %op = load <16 x double>, <16 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op) ret double %res } @@ -396,50 +396,50 @@ ; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_2048-NEXT: ret %op = load <32 x double>, <32 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v32f64(<32 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op) ret double %res 
} attributes #0 = { "target-features"="+sve" } -declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v32f16(<32 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v64f16(<64 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v128f16(<128 x half>) - -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v64f32(<64 x float>) - -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v32f64(<32 x double>) - -declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v32f16(<32 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v64f16(<64 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v128f16(<128 x half>) - -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v64f32(<64 x float>) - -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v32f64(<32 x double>) +declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>) +declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>) +declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>) +declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>) +declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>) + +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>) + +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double 
@llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>) + +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>) +declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>) +declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>) +declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>) +declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>) + +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>) + +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll @@ -29,7 +29,7 @@ ; CHECK-LABEL: uaddv_v8i8: ; CHECK: addv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res } @@ -38,7 +38,7 @@ ; CHECK-LABEL: uaddv_v16i8: ; CHECK: addv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res } @@ -50,7 +50,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) ret i8 %res } @@ -72,7 +72,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op) ret i8 %res } @@ -84,7 +84,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op) ret i8 %res } @@ -96,7 +96,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op) ret i8 %res } @@ -105,7 +105,7 @@ ; CHECK-LABEL: uaddv_v4i16: ; CHECK: addv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res } @@ -114,7 +114,7 @@ ; CHECK-LABEL: uaddv_v8i16: ; CHECK: addv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a) + %res = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res } @@ -126,7 +126,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) ret i16 %res } @@ -148,7 +148,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op) ret i16 %res } @@ -160,7 +160,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op) ret i16 %res } @@ -172,7 +172,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op) ret i16 %res } @@ -181,7 +181,7 @@ ; CHECK-LABEL: uaddv_v2i32: ; CHECK: addp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res } @@ -190,7 +190,7 @@ ; CHECK-LABEL: uaddv_v4i32: ; CHECK: addv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res } @@ -202,7 +202,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) ret i32 %res } @@ -224,7 +224,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op) ret i32 %res } @@ -236,7 +236,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op) ret i32 %res } @@ -248,7 +248,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op) ret i32 %res } @@ -257,7 +257,7 @@ ; CHECK-LABEL: uaddv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a) ret i64 %res } @@ -266,7 +266,7 @@ ; CHECK-LABEL: uaddv_v2i64: ; CHECK: addp d0, v0.2d ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ret i64 %res } @@ -278,7 +278,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op) ret i64 %res } @@ 
-300,7 +300,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op) ret i64 %res } @@ -312,7 +312,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op) ret i64 %res } @@ -324,7 +324,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op) ret i64 %res } @@ -337,7 +337,7 @@ ; CHECK-LABEL: smaxv_v8i8: ; CHECK: smaxv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) ret i8 %res } @@ -346,7 +346,7 @@ ; CHECK-LABEL: smaxv_v16i8: ; CHECK: smaxv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) ret i8 %res } @@ -358,7 +358,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op) ret i8 %res } @@ -380,7 +380,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op) ret i8 %res } @@ -392,7 +392,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op) ret i8 %res } @@ -404,7 +404,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op) ret i8 %res } @@ -413,7 +413,7 @@ ; CHECK-LABEL: smaxv_v4i16: ; CHECK: smaxv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) ret i16 %res } @@ -422,7 +422,7 @@ ; CHECK-LABEL: smaxv_v8i16: ; CHECK: smaxv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) ret i16 %res } @@ -434,7 +434,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op) ret i16 %res } @@ -456,7 +456,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op) ret i16 %res } @@ -468,7 +468,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; 
VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op) ret i16 %res } @@ -480,7 +480,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op) ret i16 %res } @@ -489,7 +489,7 @@ ; CHECK-LABEL: smaxv_v2i32: ; CHECK: smaxp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ret i32 %res } @@ -498,7 +498,7 @@ ; CHECK-LABEL: smaxv_v4i32: ; CHECK: smaxv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ret i32 %res } @@ -510,7 +510,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op) ret i32 %res } @@ -532,7 +532,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op) ret i32 %res } @@ -544,7 +544,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op) ret i32 %res } @@ -556,7 +556,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op) ret i32 %res } @@ -565,7 +565,7 @@ ; CHECK-LABEL: smaxv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a) ret i64 %res } @@ -576,7 +576,7 @@ ; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ret i64 %res } @@ -588,7 +588,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op) ret i64 %res } @@ -610,7 +610,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op) ret i64 %res } @@ -622,7 +622,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op) ret i64 %res } @@ -634,7 +634,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; 
VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op) ret i64 %res } @@ -647,7 +647,7 @@ ; CHECK-LABEL: sminv_v8i8: ; CHECK: sminv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ret i8 %res } @@ -656,7 +656,7 @@ ; CHECK-LABEL: sminv_v16i8: ; CHECK: sminv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ret i8 %res } @@ -668,7 +668,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op) ret i8 %res } @@ -690,7 +690,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op) ret i8 %res } @@ -702,7 +702,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op) ret i8 %res } @@ -714,7 +714,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op) ret i8 %res } @@ -723,7 +723,7 @@ ; CHECK-LABEL: sminv_v4i16: ; CHECK: sminv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ret i16 %res } @@ -732,7 +732,7 @@ ; CHECK-LABEL: sminv_v8i16: ; CHECK: sminv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ret i16 %res } @@ -744,7 +744,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op) ret i16 %res } @@ -766,7 +766,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op) ret i16 %res } @@ -778,7 +778,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op) ret i16 %res } @@ -790,7 +790,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op) ret i16 %res } @@ -799,7 +799,7 @@ ; CHECK-LABEL: sminv_v2i32: ; CHECK: minp v0.2s, v0.2s ; CHECK: ret - %res = call i32 
@llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ret i32 %res } @@ -808,7 +808,7 @@ ; CHECK-LABEL: sminv_v4i32: ; CHECK: sminv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ret i32 %res } @@ -820,7 +820,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op) ret i32 %res } @@ -842,7 +842,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op) ret i32 %res } @@ -854,7 +854,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op) ret i32 %res } @@ -866,7 +866,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op) ret i32 %res } @@ -875,7 +875,7 @@ ; CHECK-LABEL: sminv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a) ret i64 %res } @@ -886,7 +886,7 @@ ; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) ret i64 %res } @@ -898,7 +898,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op) ret i64 %res } @@ -920,7 +920,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op) ret i64 %res } @@ -932,7 +932,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op) ret i64 %res } @@ -944,7 +944,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op) ret i64 %res } @@ -957,7 +957,7 @@ ; CHECK-LABEL: umaxv_v8i8: ; CHECK: umaxv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) ret i8 %res } @@ -966,7 +966,7 @@ ; CHECK-LABEL: umaxv_v16i8: ; CHECK: umaxv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a) + %res 
= call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) ret i8 %res } @@ -978,7 +978,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op) ret i8 %res } @@ -1000,7 +1000,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op) ret i8 %res } @@ -1012,7 +1012,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op) ret i8 %res } @@ -1024,7 +1024,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op) ret i8 %res } @@ -1033,7 +1033,7 @@ ; CHECK-LABEL: umaxv_v4i16: ; CHECK: umaxv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) ret i16 %res } @@ -1042,7 +1042,7 @@ ; CHECK-LABEL: umaxv_v8i16: ; CHECK: umaxv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) ret i16 %res } @@ -1054,7 +1054,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op) ret i16 %res } @@ -1076,7 +1076,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op) ret i16 %res } @@ -1088,7 +1088,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op) ret i16 %res } @@ -1100,7 +1100,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op) ret i16 %res } @@ -1109,7 +1109,7 @@ ; CHECK-LABEL: umaxv_v2i32: ; CHECK: umaxp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) ret i32 %res } @@ -1118,7 +1118,7 @@ ; CHECK-LABEL: umaxv_v4i32: ; CHECK: umaxv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) ret i32 %res } @@ -1130,7 +1130,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %op) + %res = call i32 
@llvm.vector.reduce.umax.v8i32(<8 x i32> %op) ret i32 %res } @@ -1152,7 +1152,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op) ret i32 %res } @@ -1164,7 +1164,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op) ret i32 %res } @@ -1176,7 +1176,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op) ret i32 %res } @@ -1185,7 +1185,7 @@ ; CHECK-LABEL: umaxv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a) ret i64 %res } @@ -1196,7 +1196,7 @@ ; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %res } @@ -1208,7 +1208,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op) ret i64 %res } @@ -1230,7 +1230,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op) ret i64 %res } @@ -1242,7 +1242,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op) ret i64 %res } @@ -1254,7 +1254,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op) ret i64 %res } @@ -1267,7 +1267,7 @@ ; CHECK-LABEL: uminv_v8i8: ; CHECK: uminv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) ret i8 %res } @@ -1276,7 +1276,7 @@ ; CHECK-LABEL: uminv_v16i8: ; CHECK: uminv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) ret i8 %res } @@ -1288,7 +1288,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op) ret i8 %res } @@ -1310,7 +1310,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %op) + %res = call i8 
@llvm.vector.reduce.umin.v64i8(<64 x i8> %op) ret i8 %res } @@ -1322,7 +1322,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op) ret i8 %res } @@ -1334,7 +1334,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op) ret i8 %res } @@ -1343,7 +1343,7 @@ ; CHECK-LABEL: uminv_v4i16: ; CHECK: uminv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) ret i16 %res } @@ -1352,7 +1352,7 @@ ; CHECK-LABEL: uminv_v8i16: ; CHECK: uminv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) ret i16 %res } @@ -1364,7 +1364,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op) ret i16 %res } @@ -1386,7 +1386,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op) ret i16 %res } @@ -1398,7 +1398,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op) ret i16 %res } @@ -1410,7 +1410,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op) ret i16 %res } @@ -1419,7 +1419,7 @@ ; CHECK-LABEL: uminv_v2i32: ; CHECK: minp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) ret i32 %res } @@ -1428,7 +1428,7 @@ ; CHECK-LABEL: uminv_v4i32: ; CHECK: uminv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) ret i32 %res } @@ -1440,7 +1440,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op) ret i32 %res } @@ -1462,7 +1462,7 @@ ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op) ret i32 %res } @@ -1474,7 +1474,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %op) + %res = call i32 
@llvm.vector.reduce.umin.v32i32(<32 x i32> %op) ret i32 %res } @@ -1486,7 +1486,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op) ret i32 %res } @@ -1495,7 +1495,7 @@ ; CHECK-LABEL: uminv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a) ret i64 %res } @@ -1506,7 +1506,7 @@ ; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) ret i64 %res } @@ -1518,7 +1518,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op) ret i64 %res } @@ -1540,7 +1540,7 @@ ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op) ret i64 %res } @@ -1552,7 +1552,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op) ret i64 %res } @@ -1564,148 +1564,148 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op) ret i64 %res } attributes #0 = { "target-features"="+sve" } -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v256i8(<256 x i8>) - -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v128i16(<128 x i16>) - -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v64i32(<64 x i32>) - -declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) 
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v32i64(<32 x i64>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
+
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 
@llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.add.v1i1(<1 x i1> %a) -declare i8 @llvm.experimental.vector.reduce.add.v1i8(<1 x i8> %a) -declare i16 @llvm.experimental.vector.reduce.add.v1i16(<1 x i16> %a) -declare i24 @llvm.experimental.vector.reduce.add.v1i24(<1 x 
i24> %a) -declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a) -declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a) -declare i128 @llvm.experimental.vector.reduce.add.v1i128(<1 x i128> %a) - -declare i8 @llvm.experimental.vector.reduce.add.v3i8(<3 x i8> %a) -declare i8 @llvm.experimental.vector.reduce.add.v9i8(<9 x i8> %a) -declare i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> %a) -declare i1 @llvm.experimental.vector.reduce.add.v4i1(<4 x i1> %a) -declare i24 @llvm.experimental.vector.reduce.add.v4i24(<4 x i24> %a) -declare i128 @llvm.experimental.vector.reduce.add.v2i128(<2 x i128> %a) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a) +declare i1 @llvm.vector.reduce.add.v1i1(<1 x i1> %a) +declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a) +declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a) +declare i24 @llvm.vector.reduce.add.v1i24(<1 x i24> %a) +declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a) +declare i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a) + +declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a) +declare i8 @llvm.vector.reduce.add.v9i8(<9 x i8> %a) +declare i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a) +declare i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %a) +declare i24 @llvm.vector.reduce.add.v4i24(<4 x i24> %a) +declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a) define i1 @test_v1i1(<1 x i1> %a) nounwind { ; CHECK-LABEL: test_v1i1: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.add.v1i1(<1 x i1> %a) + %b = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> %a) ret i1 %b } @@ -32,7 +32,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.add.v1i8(<1 x i8> %a) + %b = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a) ret i8 %b } @@ -42,7 +42,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret - %b = call i16 @llvm.experimental.vector.reduce.add.v1i16(<1 x i16> %a) + %b = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a) ret i16 %b } @@ -50,7 +50,7 @@ ; CHECK-LABEL: test_v1i24: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.add.v1i24(<1 x i24> %a) + %b = call i24 @llvm.vector.reduce.add.v1i24(<1 x i24> %a) ret i24 %b } @@ -60,7 +60,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a) + %b = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) ret i32 %b } @@ -70,7 +70,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a) + %b = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a) ret i64 %b } @@ -78,7 +78,7 @@ ; CHECK-LABEL: test_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.add.v1i128(<1 x i128> %a) + %b = call i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a) ret i128 %b } @@ -92,7 +92,7 @@ ; CHECK-NEXT: addv h0, v0.4h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.add.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a) ret i8 %b } @@ -109,7 +109,7 @@ ; CHECK-NEXT: addv b0, 
v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.add.v9i8(<9 x i8> %a) + %b = call i8 @llvm.vector.reduce.add.v9i8(<9 x i8> %a) ret i8 %b } @@ -120,7 +120,7 @@ ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> %a) + %b = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a) ret i32 %b } @@ -131,7 +131,7 @@ ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.add.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %a) ret i1 %b } @@ -141,7 +141,7 @@ ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.add.v4i24(<4 x i24> %a) + %b = call i24 @llvm.vector.reduce.add.v4i24(<4 x i24> %a) ret i24 %b } @@ -151,7 +151,7 @@ ; CHECK-NEXT: adds x0, x0, x2 ; CHECK-NEXT: adcs x1, x1, x3 ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.add.v2i128(<2 x i128> %a) + %b = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a) ret i128 %b } @@ -164,6 +164,6 @@ ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a) + %b = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a) -declare i8 @llvm.experimental.vector.reduce.and.v1i8(<1 x i8> %a) -declare i16 @llvm.experimental.vector.reduce.and.v1i16(<1 x i16> %a) -declare i24 @llvm.experimental.vector.reduce.and.v1i24(<1 x i24> %a) -declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a) -declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %a) -declare i128 @llvm.experimental.vector.reduce.and.v1i128(<1 x i128> %a) - -declare i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) -declare i8 @llvm.experimental.vector.reduce.and.v9i8(<9 x i8> %a) -declare i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> %a) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) -declare i24 @llvm.experimental.vector.reduce.and.v4i24(<4 x i24> %a) -declare i128 @llvm.experimental.vector.reduce.and.v2i128(<2 x i128> %a) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a) +declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a) +declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %a) +declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %a) +declare i24 @llvm.vector.reduce.and.v1i24(<1 x i24> %a) +declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) +declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a) +declare i128 @llvm.vector.reduce.and.v1i128(<1 x i128> %a) + +declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a) +declare i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) +declare i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) +declare i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a) +declare i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a) 
+declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) define i1 @test_v1i1(<1 x i1> %a) nounwind { ; CHECK-LABEL: test_v1i1: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a) ret i1 %b } @@ -32,7 +32,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.and.v1i8(<1 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %a) ret i8 %b } @@ -42,7 +42,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret - %b = call i16 @llvm.experimental.vector.reduce.and.v1i16(<1 x i16> %a) + %b = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %a) ret i16 %b } @@ -50,7 +50,7 @@ ; CHECK-LABEL: test_v1i24: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.and.v1i24(<1 x i24> %a) + %b = call i24 @llvm.vector.reduce.and.v1i24(<1 x i24> %a) ret i24 %b } @@ -60,7 +60,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a) + %b = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) ret i32 %b } @@ -70,7 +70,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %a) + %b = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a) ret i64 %b } @@ -78,7 +78,7 @@ ; CHECK-LABEL: test_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.and.v1i128(<1 x i128> %a) + %b = call i128 @llvm.vector.reduce.and.v1i128(<1 x i128> %a) ret i128 %b } @@ -89,7 +89,7 @@ ; CHECK-NEXT: and w8, w8, w2 ; CHECK-NEXT: and w0, w8, #0xff ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a) ret i8 %b } @@ -120,7 +120,7 @@ ; CHECK-NEXT: umov w9, v0.b[7] ; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.and.v9i8(<9 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) ret i8 %b } @@ -133,7 +133,7 @@ ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> %a) + %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) ret i32 %b } @@ -150,7 +150,7 @@ ; CHECK-NEXT: and w8, w9, w8 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -163,7 +163,7 @@ ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.and.v4i24(<4 x i24> %a) + %b = call i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a) ret i24 %b } @@ -173,7 +173,7 @@ ; CHECK-NEXT: and x0, x0, x2 ; CHECK-NEXT: and x1, x1, x3 ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.and.v2i128(<2 x i128> %a) + %b = call i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a) ret i128 %b } @@ -189,6 +189,6 @@ ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a) + %b = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) ret i32 %b } diff --git 
a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) +declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) +declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) +declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %a) +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) +declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1: @@ -24,7 +24,7 @@ ; CHECK-NEXT: csel w0, w0, w1, lt ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -41,7 +41,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <2 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -58,7 +58,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <4 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -73,7 +73,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <8 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -88,7 +88,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <16 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -105,7 +105,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <32 x i8> %a0, zeroinitializer - %y = call i1 
@llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -119,7 +119,7 @@ ; CHECK-NEXT: csel w0, w0, w1, lt ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -136,7 +136,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <2 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -153,7 +153,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <4 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -168,7 +168,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <8 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -183,7 +183,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <16 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -200,7 +200,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <32 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll @@ -3,14 +3,14 @@ ; Same as vecreduce-fadd-legalization.ll, but without fmf. 
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -20,7 +20,7 @@ ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret - %b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -31,7 +31,7 @@ ; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -41,7 +41,7 @@ ; CHECK-NEXT: fmov d1, xzr ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret - %b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -54,7 +54,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -68,7 +68,7 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: fadd s0, s1, s0 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -86,7 +86,7 @@ ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 // =32 ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -123,6 +123,6 @@ ; CHECK-NEXT: mov s1, v3.s[3] ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll 
@@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call fast nnan half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call fast nnan double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fast nnan fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -53,7 +53,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast nnan fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -78,6 +78,6 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x 
float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -14,7 +14,7 @@ ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECKNOFP16-NEXT: faddp s0, v0.2s ; CHECKNOFP16-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) + %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) ret float %r } @@ -48,7 +48,7 @@ ; CHECKNOFP16-NEXT: fadd s0, s0, s1 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret - %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) + %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) ret half %r } @@ -103,7 +103,7 @@ ; CHECKNOFP16-NEXT: fadd s0, s0, s1 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret - %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) + %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) ret half %r } @@ -121,7 +121,7 @@ ; CHECKNOFP16-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECKNOFP16-NEXT: faddp s0, v0.2s ; CHECKNOFP16-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) + %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) ret float %r } @@ -135,7 +135,7 @@ ; CHECKNOFP16: // %bb.0: ; CHECKNOFP16-NEXT: faddp d0, v0.2d ; CHECKNOFP16-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) + %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) ret double %r } @@ -229,7 +229,7 @@ ; CHECKNOFP16-NEXT: fadd s0, s1, s0 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret - %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) + %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) ret half %r } @@ -249,7 +249,7 @@ ; CHECKNOFP16-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECKNOFP16-NEXT: faddp s0, v0.2s ; CHECKNOFP16-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) + %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) ret float %r } @@ -265,16 +265,16 @@ ; CHECKNOFP16-NEXT: fadd v0.2d, v0.2d, v1.2d ; CHECKNOFP16-NEXT: faddp d0, v0.2d ; CHECKNOFP16-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) + %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) ret double %r } ; Function Attrs: nounwind readnone -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double 
@llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>) +declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) +declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) +declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) +declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) +declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) + %b = call half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) + %b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) + %b = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %b } @@ -40,14 +40,14 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) ret fp128 %b } ; TODO: This doesn't work, because ExpandReductions only supports power of two ; unordered reductions. 
;define float @test_v3f32(<3 x float> %a) nounwind { -; %b = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) +; %b = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ; ret float %b ;} @@ -55,7 +55,7 @@ ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: ; CHECK-NEXT: b fmaxl - %b = call fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -67,6 +67,6 @@ ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) + %b = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) +declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) +declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) +declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) +declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) + %b = call nnan half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) + %b = call nnan double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) ret fp128 %b } @@ -52,7 +52,7 @@ ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: mov 
v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan ninf float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) + %b = call nnan ninf float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } @@ -72,7 +72,7 @@ ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: ; CHECK-NEXT: b fmaxl - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -84,6 +84,6 @@ ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a) -declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a) -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) -declare fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a) +declare half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a) +declare float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a) +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) +declare fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) -declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a) +declare float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) +declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a) + %b = call nnan half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) + %b = call nnan double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a) ret fp128 %b } @@ -52,7 +52,7 @@ ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fminnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) ret 
float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fminnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan ninf float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) + %b = call nnan ninf float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) ret float %b } @@ -72,7 +72,7 @@ ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: ; CHECK-NEXT: b fminl - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -84,6 +84,6 @@ ; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: fminnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll @@ -3,14 +3,14 @@ ; Same as vecreduce-fmul-legalization.ll, but without fmf. -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fmul.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fmul.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -20,7 +20,7 @@ ; CHECK-NEXT: fmul s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret - %b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -31,7 +31,7 @@ ; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: fmul s0, s1, v0.s[0] ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -41,7 +41,7 @@ ; CHECK-NEXT: fmov d1, xzr ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: ret - %b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fmul.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -54,7 +54,7 @@ ; CHECK-NEXT: bl __multf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret 
fp128 %b } @@ -66,7 +66,7 @@ ; CHECK-NEXT: fmul s1, s1, v0.s[1] ; CHECK-NEXT: fmul s0, s1, v0.s[2] ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -84,7 +84,7 @@ ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 // =32 ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -109,6 +109,6 @@ ; CHECK-NEXT: fmul s0, s0, v3.s[2] ; CHECK-NEXT: fmul s0, s0, v3.s[3] ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll @@ -24,8 +24,8 @@ %1 = insertelement <4 x double> %0, double 1.0, i32 1 %2 = insertelement <4 x double> %1, double 1.0, i32 2 %3 = insertelement <4 x double> %2, double 1.0, i32 3 - %4 = call nnan reassoc double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %3) + %4 = call nnan reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> %3) ret double %4 } -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.umax.v1i1(<1 x i1> %a) -declare i8 @llvm.experimental.vector.reduce.umax.v1i8(<1 x i8> %a) -declare i16 @llvm.experimental.vector.reduce.umax.v1i16(<1 x i16> %a) -declare i24 @llvm.experimental.vector.reduce.umax.v1i24(<1 x i24> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a) -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a) -declare i128 @llvm.experimental.vector.reduce.umax.v1i128(<1 x i128> %a) - -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a) -declare i8 @llvm.experimental.vector.reduce.umax.v3i8(<3 x i8> %a) -declare i8 @llvm.experimental.vector.reduce.umax.v9i8(<9 x i8> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v3i32(<3 x i32> %a) -declare i1 @llvm.experimental.vector.reduce.umax.v4i1(<4 x i1> %a) -declare i24 @llvm.experimental.vector.reduce.umax.v4i24(<4 x i24> %a) -declare i128 @llvm.experimental.vector.reduce.umax.v2i128(<2 x i128> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a) +declare i1 @llvm.vector.reduce.umax.v1i1(<1 x i1> %a) +declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %a) +declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %a) +declare i24 @llvm.vector.reduce.umax.v1i24(<1 x i24> %a) +declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) +declare i64 
@llvm.vector.reduce.umax.v1i64(<1 x i64> %a) +declare i128 @llvm.vector.reduce.umax.v1i128(<1 x i128> %a) + +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) +declare i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %a) +declare i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a) +declare i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) +declare i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a) +declare i24 @llvm.vector.reduce.umax.v4i24(<4 x i24> %a) +declare i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a) define i1 @test_v1i1(<1 x i1> %a) nounwind { ; CHECK-LABEL: test_v1i1: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.umax.v1i1(<1 x i1> %a) + %b = call i1 @llvm.vector.reduce.umax.v1i1(<1 x i1> %a) ret i1 %b } @@ -33,7 +33,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.umax.v1i8(<1 x i8> %a) + %b = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %a) ret i8 %b } @@ -43,7 +43,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret - %b = call i16 @llvm.experimental.vector.reduce.umax.v1i16(<1 x i16> %a) + %b = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %a) ret i16 %b } @@ -51,7 +51,7 @@ ; CHECK-LABEL: test_v1i24: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.umax.v1i24(<1 x i24> %a) + %b = call i24 @llvm.vector.reduce.umax.v1i24(<1 x i24> %a) ret i24 %b } @@ -61,7 +61,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a) + %b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) ret i32 %b } @@ -71,7 +71,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a) + %b = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a) ret i64 %b } @@ -79,7 +79,7 @@ ; CHECK-LABEL: test_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.umax.v1i128(<1 x i128> %a) + %b = call i128 @llvm.vector.reduce.umax.v1i128(<1 x i128> %a) ret i128 %b } @@ -92,7 +92,7 @@ ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: csel x0, x9, x8, hi ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a) + %b = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %b } @@ -107,7 +107,7 @@ ; CHECK-NEXT: umaxv h0, v0.4h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.umax.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %a) ret i8 %b } @@ -124,7 +124,7 @@ ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.umax.v9i8(<9 x i8> %a) + %b = call i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a) ret i8 %b } @@ -135,7 +135,7 @@ ; CHECK-NEXT: umaxv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.umax.v3i32(<3 x i32> %a) + %b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) ret i32 %b } @@ -148,7 +148,7 @@ ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.umax.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a) ret i1 %b } @@ 
-159,7 +159,7 @@ ; CHECK-NEXT: umaxv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.umax.v4i24(<4 x i24> %a) + %b = call i24 @llvm.vector.reduce.umax.v4i24(<4 x i24> %a) ret i24 %b } @@ -173,7 +173,7 @@ ; CHECK-NEXT: csel x0, x8, x9, eq ; CHECK-NEXT: csel x1, x1, x3, hi ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.umax.v2i128(<2 x i128> %a) + %b = call i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a) ret i128 %b } @@ -186,6 +186,6 @@ ; CHECK-NEXT: umaxv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a) + %b = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %a) + %b = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half 0.0, <4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl __aeabi_fadd ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a) + %b = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl __aeabi_dadd ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double zeroinitializer, <2 x double> %a) + %b = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double zeroinitializer, <2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -28,7 +28,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -44,7 +44,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI1_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -56,7 +56,7 @@ ; CHECK-NEXT: vadd.f64 d16, d17, d16 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr - %b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -76,7 +76,7 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -95,7 +95,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI4_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -124,7 +124,7 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -162,6 +162,6 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI6_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll 
b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128>) +declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a) + %b = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl fmaxf ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) + %b = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl fmax ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + %b = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128>) +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a) + %b = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl fminf ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float 
@llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) + %b = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl fmin ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + %b = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) +declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half 1.0, <4 x half> %a) + %b = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl __aeabi_fmul ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a) + %b = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl __aeabi_dmul ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a) + %b = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s 
-mtriple=arm-none-eabi -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fmul.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fmul.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -28,7 +28,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -44,7 +44,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI1_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -56,7 +56,7 @@ ; CHECK-NEXT: vmul.f64 d16, d17, d16 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr - %b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fmul.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -76,7 +76,7 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -95,7 +95,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI4_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -124,7 +124,7 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -162,6 +162,6 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI6_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll --- 
a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll +++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -1,24 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -expand-reductions -S | FileCheck %s ; Tests without a target which should expand all reductions -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) -declare i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8>) +declare i8 @llvm.vector.reduce.and.i8.v3i8(<3 x i8>) define i64 @add_i64(<2 x i64> %vec) { ; CHECK-LABEL: @add_i64( @@ -29,7 +29,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %vec) ret i64 %r } @@ -42,7 +42,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %vec) ret i64 %r } @@ -55,7 +55,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %vec) ret i64 %r } @@ -68,7 +68,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %vec) ret i64 %r } @@ -81,7 +81,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %vec) ret i64 %r } @@ -97,7 +97,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %vec) ret float %r } @@ -113,7 +113,7 @@ ; CHECK-NEXT: ret float 
[[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -131,7 +131,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -149,7 +149,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -165,7 +165,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %vec) ret float %r } @@ -181,7 +181,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -199,7 +199,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -217,7 +217,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -231,7 +231,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %vec) ret i64 %r } @@ -245,7 +245,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %vec) ret i64 %r } @@ -259,7 +259,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %vec) ret i64 %r } @@ -273,7 +273,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %vec) ret i64 %r } @@ -282,11 +282,11 @@ define double @fmax_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmax_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) ; CHECK-NEXT: ret double [[R]] ; entry: - %r = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %vec) + %r = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %vec) ret double %r } @@ -295,11 +295,11 @@ define double @fmin_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmin_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) ; CHECK-NEXT: 
ret double [[R]] ; entry: - %r = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %vec) + %r = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %vec) ret double %r } @@ -309,10 +309,10 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: @test_v3i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) ; CHECK-NEXT: ret i8 [[B]] ; entry: - %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.i8.v3i8(<3 x i8> %a) ret i8 %b } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir @@ -44,7 +44,7 @@ %add7 = add <4 x i32> %mul, %splat.output %max = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %add7, <4 x i32> %.splat.i42, i32 1, <4 x i1> %pred, <4 x i32> undef) %min = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %max, <4 x i32> %.splat.i, i32 1, <4 x i1> %pred, <4 x i32> undef) - %reduce = tail call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %min) + %reduce = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %min) store i32 %reduce, i32* %scevgep2 %add.ptr = getelementptr inbounds i8, i8* %input_1_vect.addr.052, i32 4 %add.ptr14 = getelementptr inbounds i8, i8* %input_2_vect.addr.051, i32 4 @@ -62,7 +62,7 @@ declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 declare i1 @llvm.test.set.loop.iterations.i32(i32) #4 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #4 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #5 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #5 ... 
--- diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -85,7 +85,7 @@ middle.block: ; preds = %vector.body %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) + %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -188,7 +188,7 @@ middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -287,7 +287,7 @@ middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -386,7 +386,7 @@ middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -528,6 +528,6 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) ; Function Attrs: nounwind readnone willreturn -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir @@ -56,7 +56,7 @@ br i1 %tmp16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14) + %tmp17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -64,7 +64,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir @@ -58,7 +58,7 @@ br i1 %tmp16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14) + %tmp17 
= call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -66,7 +66,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir @@ -68,7 +68,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %tmp12 = mul nsw <4 x i32> %pass, %tmp10 - %tmp13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp12) + %tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12) %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -105,7 +105,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %tmp12 = add nsw <4 x i32> %pass, %tmp10 - %tmp13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp12) + %tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12) %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -117,7 +117,7 @@ ret i32 %res } - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir @@ -40,7 +40,7 @@ br i1 %15, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %16 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) + %16 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -88,7 +88,7 @@ br i1 %15, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %16 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) + %16 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -98,7 +98,7 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x 
float>*, i32 immarg, <4 x i1>, <4 x float>) - declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) + declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir @@ -91,7 +91,7 @@ %22 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %10) %23 = bitcast i16* %lsr.iv7 to i1* %24 = select <4 x i1> %22, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa - %25 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %24) + %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24) %sunkaddr = mul i32 %i.064.us, 4 %26 = bitcast i32* %e to i8* %sunkaddr17 = getelementptr inbounds i8, i8* %26, i32 %sunkaddr @@ -141,7 +141,7 @@ } declare dso_local arm_aapcs_vfpcc signext i16 @crc16(...) local_unnamed_addr #0 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -69,7 +69,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -145,7 +145,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -221,7 +221,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -297,7 +297,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -371,7 +371,7 @@ middle.block: ; preds = %vector.body %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi - %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -1273,6 +1273,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void 
@llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -51,7 +51,7 @@ ; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) ; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1 ; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]] @@ -112,7 +112,7 @@ middle.block: ; preds = %vector.body %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi - %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17) + %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17) store i32 %tmp18, i32* %arrayidx8.us, align 4 %inc10.us = add nuw i32 %i.025.us, 1 %exitcond27 = icmp eq i32 %inc10.us, %N @@ -170,7 +170,7 @@ ; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) ; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1 ; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]] @@ -229,7 +229,7 @@ middle.block: ; preds = %vector.body %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi - %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp15) + %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp15) store i32 %tmp16, i32* %arrayidx7.us, align 4 %inc9.us = add nuw i32 %i.024.us, 1 %exitcond26 = icmp eq i32 %inc9.us, %N @@ -247,7 +247,7 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #0 ; Function Attrs: nounwind readnone willreturn -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #1 +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1 ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -40,7 +40,7 @@ br i1 %tmp15, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp13) + %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x 
i32> %tmp13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -48,7 +48,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir @@ -44,7 +44,7 @@ %.lcssa = phi <16 x i8> [ %13, %vector.body ] %16 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7) %17 = select <16 x i1> %16, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa - %18 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %17) + %18 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %17) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -53,7 +53,7 @@ } declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1 - declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #2 + declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir @@ -36,7 +36,7 @@ br i1 %cmp, label %for.body, label %middle.block middle.block: ; preds = %for.body - %reduce = tail call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %acc.next) + %reduce = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc.next) ret i16 %reduce for.cond.cleanup: ; preds = %entry @@ -47,7 +47,7 @@ declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) #2 declare i1 @llvm.test.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #4 + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #4 declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir @@ -41,7 +41,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -88,7 +88,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -135,7 +135,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -182,7 +182,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -228,7 +228,7 @@ br i1 %14, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -274,7 +274,7 @@ br i1 %14, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -285,7 +285,7 @@ declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -45,7 +45,7 @@ %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16 %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi - %i6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i5) + %i6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i5) %index.next = add i32 %index, 16 %i7 = icmp eq i32 %index.next, %n.vec br i1 %i7, 
label %middle.block, label %vector.body @@ -123,7 +123,7 @@ middle.block: ; preds = %vector.body %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi - %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9) + %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -193,7 +193,7 @@ middle.block: ; preds = %vector.body %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) + %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -265,7 +265,7 @@ middle.block: ; preds = %vector.body %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi - %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9) + %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -335,7 +335,7 @@ middle.block: ; preds = %vector.body %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) + %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -407,7 +407,7 @@ middle.block: ; preds = %vector.body %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi - %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9) + %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -504,7 +504,7 @@ middle.block: ; preds = %vector.body %i9 = select <4 x i1> %active.lane.mask, <4 x i32> %i7, <4 x i32> %vec.phi - %i10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i9) + %i10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i9) br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph47 vector.ph47: ; preds = %middle.block @@ -534,7 +534,7 @@ middle.block44: ; preds = %vector.body46 %i21 = select <4 x i1> %active.lane.mask61, <4 x i32> %i19, <4 x i32> %vec.phi60 - %i22 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i21) + %i22 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i21) br label %for.cond.cleanup7 for.cond.cleanup7: ; preds = %middle.block44, %middle.block, %entry @@ -620,9 +620,9 @@ middle.block: ; preds = %vector.body %i11 = select <8 x i1> %active.lane.mask, <8 x i16> %i8, <8 x i16> %vec.phi - %i12 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i11) + %i12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i11) %i13 = select <8 x i1> %active.lane.mask, <8 x i16> %i9, <8 x i16> %vec.phi.1 - %i14 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i13) + %i14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -747,7 +747,7 @@ middle.block: ; preds = %vector.body %10 = select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi - %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %10) + %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10) br label %for.end for.end: ; preds = %middle.block, %lor.end @@ -758,10 +758,10 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare <16 x i1> 
@llvm.get.active.lane.mask.v16i1.i32(i32, i32) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir @@ -46,7 +46,7 @@ %.lcssa = phi <4 x i32> [ %15, %vector.body ], !dbg !38 %18 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %9), !dbg !34 %19 = select <4 x i1> %18, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa, !dbg !38 - %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19), !dbg !32 + %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19), !dbg !32 br label %for.cond.cleanup, !dbg !42 for.cond.cleanup: ; preds = %middle.block, %entry @@ -58,7 +58,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -258,7 +258,7 @@ middle.block: ; preds = %vector.body %19 = select <4 x i1> %active.lane.mask, <4 x i32> %16, <4 x i32> %vec.phi - %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19) + %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19) br label %for.end for.end: ; preds = %middle.block, %for.body @@ -282,6 +282,6 @@ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll @@ -74,14 +74,14 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7) - %10 = call i32 
@llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5) + %9 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %7) + %10 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %5) store i32 %10, i32* %minp, align 4 ret i32 %9 } declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3 -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3 +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #3 +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir @@ -26,7 +26,7 @@ %tmp8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %tmp7) %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) - %min = tail call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %wide.masked.load) + %min = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %wide.masked.load) store i16 %min, i16* %lsr.iv.2 %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 %scevgep.2 = getelementptr i16, i16* %lsr.iv.2, i32 1 @@ -43,7 +43,7 @@ declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) - declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) + declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) ... --- diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir @@ -26,7 +26,7 @@ %tmp9 = sub i32 %tmp7, 4 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp10) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp10) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 @@ -64,7 +64,7 @@ %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) %sext = sext <8 x i16> %wide.masked.load to <8 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %sext) + %tmp11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sext) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 @@ -102,7 +102,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %sext = sext <16 x i8> %wide.masked.load to <16 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %sext) + %tmp11 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %sext) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i8, i8* %lsr.iv, 
i32 16 @@ -140,7 +140,7 @@ %tmp9 = sub i32 %tmp7, 4 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp10) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp10) %acc.next = add i32 %tmp11, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) @@ -179,7 +179,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 @@ -218,7 +218,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) %acc.next = add i32 %tmp11, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) @@ -257,7 +257,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = zext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 @@ -296,7 +296,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = zext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) %acc.next = add i32 %tmp11, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) @@ -335,7 +335,7 @@ %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv17, i32 1, <8 x i1> %tmp8, <8 x i8> undef) %sext.wide = sext <8 x i8> %wide.masked.load to <8 x i16> %sub = sub <8 x i16> %sext.wide, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %sext.reduce = sext i16 %reduce to i32 store i32 %sext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -375,7 +375,7 @@ %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv17, i32 1, <8 x i1> %tmp8, <8 x i8> undef) %sext.wide = sext <8 x i8> %wide.masked.load to <8 x i16> %sub = sub <8 x i16> %sext.wide, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %sext.reduce = sext i16 %reduce to i32 %acc.next = add i32 %sext.reduce, %acc %scevgep = getelementptr i8, i8* %lsr.iv, i32 8 @@ -414,7 +414,7 @@ %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) %sub = sub <8 x i16> %wide.masked.load, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %zext.reduce = zext i16 %reduce to i32 store i32 %zext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -453,7 +453,7 @@ %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) %sub = sub <8 x i16> %wide.masked.load, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %zext.reduce = zext i16 %reduce to i32 %acc.next = add i32 %zext.reduce, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 @@ -492,7 +492,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %sext.reduce = sext i8 %reduce to i32 store i32 %sext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -531,7 +531,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %sext.reduce = sext i8 %reduce to i32 %acc.next = add i32 %sext.reduce, %acc %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 @@ -570,7 +570,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %zext.reduce = zext i8 %reduce to i32 store i32 %zext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -609,7 +609,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %zext.reduce = zext i8 %reduce to i32 %acc.next = add i32 %zext.reduce, %acc %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 @@ -652,7 +652,7 @@ %tmp4 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp3, i32 2, <4 x i1> %tmp, <4 x i16> zeroinitializer) %zext.wide.2 = zext <4 x i16> %tmp4 to <4 x i32> %or = or <4 x i32> %zext.wide.1, %zext.wide.2 - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %or) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %or) 
%acc.next = add i32 %reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 4 %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 4 @@ -693,7 +693,7 @@ %tmp2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp1, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %tmp4 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp3, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %or = or <8 x i16> %tmp2, %tmp4 - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %or) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %or) %zext.reduce = zext i16 %reduce to i32 %acc.next = add i32 %zext.reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8 @@ -737,7 +737,7 @@ %tmp5 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 1) %tmp6 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 0) %mul = add <4 x i32> %tmp5, %tmp6 - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %mul) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul) %acc.next = add i32 %reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8 %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 8 @@ -778,7 +778,7 @@ %tmp2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp1, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %tmp4 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp3, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %mul = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 1) - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %mul) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul) %acc.next = add i32 %reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8 %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 8 @@ -798,11 +798,11 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) - declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) - declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) + declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) + declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -214,7 +214,7 @@ middle.block: ; preds = %vector.body %i19 = select <4 x i1> %active.lane.mask, <4 x i32> 
%i16, <4 x i32> %vec.phi - %i20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i19) + %i20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i19) br label %for.end for.end: ; preds = %middle.block, %for.body @@ -235,6 +235,6 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare void @llvm.set.loop.iterations.i32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -47,7 +47,7 @@ %15 = add i32 %8, 4 %16 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %15) %17 = select <4 x i1> %16, <4 x i32> %12, <4 x i32> %vec.phi - %18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %17) + %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %17) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -55,7 +55,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir @@ -46,7 +46,7 @@ %.lcssa = phi <8 x i16> [ %15, %vector.body ] %18 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) %19 = select <8 x i1> %18, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa - %20 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %19) + %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -54,7 +54,7 @@ ret i16 %a.0.lcssa } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -70,7 +70,7 @@ middle.block: ; preds = %vector.body %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi - %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8) + %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -141,7 +141,7 @@ middle.block: ; preds = %vector.body %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi - %6 = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -212,7 +212,7 @@ middle.block: ; preds = %vector.body %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi - %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -459,7 +459,7 @@ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -16,7 +16,7 @@ ; CHECK: middle.block: ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]], -; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) +; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { entry: @@ -64,7 +64,7 @@ middle.block: ; preds = %vector.body %12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi - %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -73,7 +73,7 @@ } declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -118,7 +118,7 @@ middle.block: ; preds = %vector.body %8 = call <4 x i1> @llvm.arm.vctp32(i32 %5) %tmp8 = select <4 x i1> %8, <4 x i32> %tmp6, <4 x i32> %vec.phi - %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) + %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -134,7 +134,7 @@ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) declare <16 x i1> @llvm.arm.vctp8(i32) declare void @llvm.stackprotector(i8*, i8**) declare <8 x i1> @llvm.arm.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir @@ -46,7 +46,7 @@ %.lcssa = phi <8 x i16> [ %15, %vector.body ] %18 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) %19 = select <8 x i1> %18, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa - %20 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %19) + %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -54,7 +54,7 @@ ret i16 %a.0.lcssa } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -52,7 +52,7 @@ %n.splat = shufflevector <4 x i32> %insert.n, <4 x i32> undef, <4 x i32> zeroinitializer %tmp16 = icmp ult <4 x i32> %idx.splat, %n.splat %tmp17 = select <4 x i1> %tmp16, <4 x i32> %tmp13, <4 x i32> %vec.phi - %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17) + %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -60,7 +60,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -45,7 +45,7 @@ middle.block: ; preds = %vector.body %15 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %8) %16 = select <4 x i1> %15, <4 x i32> %12, <4 x i32> %vec.phi - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %16) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %16) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -53,7 +53,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 
x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -572,7 +572,7 @@ br i1 %10, label %middle.block, label %vector.body, !llvm.loop !7 middle.block: ; preds = %vector.body - %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) ;for.cond8.for.cond.cleanup10_crit_edge.us.us: ; preds = %for.body11.us.us, %middle.block %add19.us.us = add i32 %j.051.us.us, %mul18.us %arrayidx20.us.us = getelementptr inbounds i32, i32* %C, i32 %add19.us.us @@ -803,7 +803,7 @@ br i1 %12, label %middle.block, label %vector.body, !llvm.loop !7 middle.block: ; preds = %vector.body - %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %11) + %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11) br i1 %cmp.n, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us.preheader for.cond5.for.cond.cleanup7_crit_edge.us.us: ; preds = %for.body8.us.us, %middle.block @@ -1065,7 +1065,7 @@ %wide.masked.gather75 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %tmp85, i32 1, <4 x i1> , <4 x i8> undef) %tmp86 = sext <4 x i8> %wide.masked.gather75 to <4 x i32> %tmp87 = mul nsw <4 x i32> %tmp84, %tmp86 - %tmp88 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp87) + %tmp88 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp87) %tmp89 = add i32 %tmp88, %vec.phi %index.next = add i32 %index, 4 %vec.ind.next = add <4 x i32> %vec.ind, @@ -1091,7 +1091,7 @@ declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>) #3 -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.memset.p0i8.i32(i8* align 2, i8, i32, i1) declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -62,7 +62,7 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi - %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) store i32 %10, i32* %arrayidx.us.us, align 4 %inc21.us.us = add nuw i32 4, 1 %exitcond81.not = icmp eq i32 %inc21.us.us, %n @@ -139,7 +139,7 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi - %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) store i32 %10, i32* %arrayidx.us.us, align 4 %inc21.us.us = add nuw i32 4, 1 %exitcond81.not = icmp eq i32 %inc21.us.us, %n @@ -210,7 +210,7 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi - %10 = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) store i32 %10, i32* %arrayidx.us.us, align 4 %inc21.us.us = add nuw i32 4, 1 %exitcond81.not = icmp eq i32 %inc21.us.us, %n @@ -440,7 +440,7 @@ ret void } -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1390,7 +1390,7 @@ declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) -declare i32 @llvm.experimental.vector.reduce.add.v16i8(<16 x i32> %ext4) +declare i32 @llvm.vector.reduce.add.v16i8(<16 x i32> %ext4) declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>) declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>) declare i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll --- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s -declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>) +declare i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64>) +declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>) +declare i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>) +declare i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8>) define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) { ; CHECK-LABEL: vaddv_v2i64_i64: @@ -20,7 +20,7 @@ ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: bx lr entry: - %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1) + %r = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) ret i64 %r } @@ -30,7 +30,7 @@ ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %s1) ret i32 %r } @@ -41,7 +41,7 @@ ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %s1) + %r = call i32 
@llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1) ret i32 %r } @@ -51,7 +51,7 @@ ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16> %s1) ret i16 %r } @@ -62,7 +62,7 @@ ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %s1) + %r = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1) ret i16 %r } @@ -72,7 +72,7 @@ ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8> %s1) ret i8 %r } @@ -83,7 +83,7 @@ ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %s1) + %r = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1) ret i8 %r } @@ -102,7 +102,7 @@ ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: - %t = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1) + %t = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) %r = add i64 %t, %x ret i64 %r } @@ -113,7 +113,7 @@ ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1) + %t = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %s1) %r = add i32 %t, %x ret i32 %r } @@ -125,7 +125,7 @@ ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %s1) + %t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1) %r = add i32 %t, %x ret i32 %r } @@ -136,7 +136,7 @@ ; CHECK-NEXT: vaddva.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1) + %t = call i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16> %s1) %r = add i16 %t, %x ret i16 %r } @@ -148,7 +148,7 @@ ; CHECK-NEXT: vaddva.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %s1) + %t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1) %r = add i16 %t, %x ret i16 %r } @@ -159,7 +159,7 @@ ; CHECK-NEXT: vaddva.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1) + %t = call i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8> %s1) %r = add i8 %t, %x ret i8 %r } @@ -171,7 +171,7 @@ ; CHECK-NEXT: vaddva.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %s1) + %t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1) %r = add i8 %t, %x ret i8 %r } diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) ret i32 %z } @@ -18,7 +18,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i32> %x to <4 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) ret i64 %z } @@ -29,7 +29,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i32> %x to <4 x i64> - %z = call i64 
@llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) ret i64 %z } @@ -47,7 +47,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -65,7 +65,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -76,7 +76,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) ret i32 %z } @@ -87,7 +87,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) ret i32 %z } @@ -99,7 +99,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -111,7 +111,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -122,7 +122,7 @@ ; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) ret i16 %z } @@ -175,7 +175,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) ret i64 %z } @@ -242,7 +242,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) ret i64 %z } @@ -258,7 +258,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -278,7 +278,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -289,7 +289,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) ret i32 %z } @@ -300,7 +300,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) ret i32 %z } @@ -313,7 +313,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -326,7 +326,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -338,7 +338,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) ret i16 %z } @@ -350,7 +350,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) ret i16 %z } @@ -363,7 +363,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) ret i16 %z } @@ -376,7 +376,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) ret i16 %z } @@ -387,7 +387,7 @@ ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) ret i8 %z } @@ -492,7 +492,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) ret i64 %z } @@ -627,7 +627,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) ret i64 %z } @@ -643,7 +643,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -663,7 +663,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -678,7 +678,7 @@ ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) ret i64 %z } @@ -688,7 +688,7 @@ ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) %r = add i32 %z, %a ret i32 %r } @@ -700,7 +700,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i32> %x to <4 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -712,7 +712,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i32> %x to <4 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -735,7 +735,7 @@ ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -756,7 +756,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 
@llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -768,7 +768,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -780,7 +780,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -793,7 +793,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -806,7 +806,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -818,7 +818,7 @@ ; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) %r = add i16 %z, %a ret i16 %r } @@ -876,7 +876,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -948,7 +948,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -967,7 +967,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -990,7 +990,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1002,7 +1002,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1014,7 +1014,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1028,7 +1028,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1042,7 +1042,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1055,7 +1055,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 
@llvm.vector.reduce.add.v16i16(<16 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1068,7 +1068,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1082,7 +1082,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1096,7 +1096,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1108,7 +1108,7 @@ ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) %r = add i8 %z, %a ret i8 %r } @@ -1218,7 +1218,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1358,7 +1358,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1377,7 +1377,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1400,7 +1400,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1420,18 +1420,18 @@ ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 
@llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -10,7 +10,7 @@ entry: %c = icmp eq <4 x i32> %b, zeroinitializer %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -24,7 +24,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = zext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -38,7 +38,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = sext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -73,7 +73,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = zext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -114,7 +114,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = sext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -128,7 +128,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i32> %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -142,7 +142,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i32> %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -158,7 +158,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = zext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -174,7 +174,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = sext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -188,7 +188,7 @@ entry: %c = icmp eq <8 x i16> %b, zeroinitializer %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -314,7 +314,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> 
%s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -456,7 +456,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -492,7 +492,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -537,7 +537,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = sext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -551,7 +551,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -565,7 +565,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -582,7 +582,7 @@ %c = icmp eq <4 x i8> %b, zeroinitializer %xx = zext <4 x i8> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -600,7 +600,7 @@ %c = icmp eq <4 x i8> %b, zeroinitializer %xx = sext <4 x i8> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -615,7 +615,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i16> %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -630,7 +630,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i16> %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -647,7 +647,7 @@ %c = icmp eq <8 x i8> %b, zeroinitializer %xx = zext <8 x i8> %x to <8 x i16> %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -664,7 +664,7 @@ %c = icmp eq <8 x i8> %b, zeroinitializer %xx = sext <8 x i8> %x to <8 x i16> %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -678,7 +678,7 @@ 
entry: %c = icmp eq <16 x i8> %b, zeroinitializer %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) ret i8 %z } @@ -948,7 +948,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1257,7 +1257,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1293,7 +1293,7 @@ %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1338,7 +1338,7 @@ %c = icmp eq <2 x i8> %b, zeroinitializer %xx = sext <2 x i8> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1372,7 +1372,7 @@ entry: %c = icmp eq <2 x i64> %b, zeroinitializer %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1385,7 +1385,7 @@ entry: %c = icmp eq <4 x i32> %b, zeroinitializer %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1400,7 +1400,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = zext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1415,7 +1415,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = sext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1455,7 +1455,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = zext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1501,7 +1501,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = sext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1516,7 +1516,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i32> %s = 
select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1531,7 +1531,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i32> %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1548,7 +1548,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = zext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1565,7 +1565,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = sext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1580,7 +1580,7 @@ entry: %c = icmp eq <8 x i16> %b, zeroinitializer %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -1711,7 +1711,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1858,7 +1858,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1897,7 +1897,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1947,7 +1947,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = sext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1962,7 +1962,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1977,7 +1977,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ 
-1995,7 +1995,7 @@ %c = icmp eq <4 x i8> %b, zeroinitializer %xx = zext <4 x i8> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2014,7 +2014,7 @@ %c = icmp eq <4 x i8> %b, zeroinitializer %xx = sext <4 x i8> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2030,7 +2030,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i16> %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2046,7 +2046,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i16> %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2064,7 +2064,7 @@ %c = icmp eq <8 x i8> %b, zeroinitializer %xx = zext <8 x i8> %x to <8 x i16> %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2082,7 +2082,7 @@ %c = icmp eq <8 x i8> %b, zeroinitializer %xx = sext <8 x i8> %x to <8 x i16> %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2097,7 +2097,7 @@ entry: %c = icmp eq <16 x i8> %b, zeroinitializer %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) %r = add i8 %z, %a ret i8 %r } @@ -2372,7 +2372,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2686,7 +2686,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2725,7 +2725,7 @@ %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2775,7 +2775,7 @@ %c = icmp eq <2 x i8> %b, zeroinitializer %xx = sext <2 x i8> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x 
i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2814,18 +2814,18 @@ entry: %c = icmp eq <2 x i64> %b, zeroinitializer %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %x) ret i32 %z } @@ -25,7 +25,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x) ret i32 %z } @@ -42,7 +42,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %x) ret i32 %z } @@ -58,7 +58,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %x) ret i16 %z } @@ -76,7 +76,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x) ret i16 %z } @@ -95,7 +95,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %x) ret i16 %z } @@ -113,7 +113,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x) ret i8 %z } @@ -133,7 +133,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> 
%x) ret i8 %z } @@ -154,7 +154,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %x) ret i8 %z } @@ -163,7 +163,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %x) ret i64 %z } @@ -178,7 +178,7 @@ ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x) ret i64 %z } @@ -194,7 +194,7 @@ ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x) ret i64 %z } @@ -207,7 +207,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %x) %r = and i32 %y, %z ret i32 %r } @@ -225,7 +225,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x) %r = and i32 %y, %z ret i32 %r } @@ -244,7 +244,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %x) %r = and i32 %y, %z ret i32 %r } @@ -262,7 +262,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %x) %r = and i16 %y, %z ret i16 %r } @@ -282,7 +282,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x) %r = and i16 %y, %z ret i16 %r } @@ -303,7 +303,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %x) %r = and i16 %y, %z ret i16 %r } @@ -323,7 +323,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x) %r = and i8 %y, %z ret i8 %r } @@ -345,7 +345,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %x) %r = and i8 %y, %z ret i8 %r } @@ -368,7 +368,7 @@ ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %x) %r = and i8 %y, %z ret i8 %r } @@ -380,7 +380,7 @@ ; CHECK-NEXT: ands r1, r3 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %x) %r = and i64 %y, %z ret i64 %r } @@ -398,7 +398,7 @@ ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x) %r = and i64 %y, %z ret i64 %r } @@ -417,7 +417,7 @@ ; CHECK-NEXT: ands r1, r2 ; 
CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x) %r = and i64 %y, %z ret i64 %r } @@ -430,7 +430,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %x) ret i32 %z } @@ -446,7 +446,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x) ret i32 %z } @@ -463,7 +463,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %x) ret i32 %z } @@ -479,7 +479,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x) ret i16 %z } @@ -497,7 +497,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %x) ret i16 %z } @@ -516,7 +516,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %x) ret i16 %z } @@ -534,7 +534,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x) ret i8 %z } @@ -554,7 +554,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %x) ret i8 %z } @@ -575,7 +575,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %x) ret i8 %z } @@ -584,7 +584,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %x) ret i64 %z } @@ -599,7 +599,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x) ret i64 %z } @@ -615,7 +615,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x) ret i64 %z } @@ -628,7 +628,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %x) %r = or i32 %y, %z ret i32 %r } @@ -646,7 +646,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x) %r = or i32 %y, %z ret i32 %r } @@ -665,7 +665,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %x) %r = or i32 %y, %z ret i32 %r } @@ -683,7 +683,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: 
bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x) %r = or i16 %y, %z ret i16 %r } @@ -703,7 +703,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %x) %r = or i16 %y, %z ret i16 %r } @@ -724,7 +724,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %x) %r = or i16 %y, %z ret i16 %r } @@ -744,7 +744,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x) %r = or i8 %y, %z ret i8 %r } @@ -766,7 +766,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %x) %r = or i8 %y, %z ret i8 %r } @@ -789,7 +789,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %x) %r = or i8 %y, %z ret i8 %r } @@ -801,7 +801,7 @@ ; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %x) %r = or i64 %y, %z ret i64 %r } @@ -819,7 +819,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x) %r = or i64 %y, %z ret i64 %r } @@ -838,7 +838,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x) %r = or i64 %y, %z ret i64 %r } @@ -851,7 +851,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %x) ret i32 %z } @@ -867,7 +867,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %x) ret i32 %z } @@ -884,7 +884,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %x) ret i32 %z } @@ -900,7 +900,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %x) ret i16 %z } @@ -918,7 +918,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %x) ret i16 %z } @@ -937,7 +937,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %x) ret i16 %z } @@ -955,7 +955,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %x) + %z = call i8 
@llvm.vector.reduce.xor.v8i8(<8 x i8> %x) ret i8 %z } @@ -975,7 +975,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %x) ret i8 %z } @@ -996,7 +996,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %x) ret i8 %z } @@ -1005,7 +1005,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %x) ret i64 %z } @@ -1020,7 +1020,7 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x) ret i64 %z } @@ -1036,7 +1036,7 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x) ret i64 %z } @@ -1049,7 +1049,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %x) %r = xor i32 %y, %z ret i32 %r } @@ -1067,7 +1067,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %x) %r = xor i32 %y, %z ret i32 %r } @@ -1086,7 +1086,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %x) %r = xor i32 %y, %z ret i32 %r } @@ -1104,7 +1104,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %x) %r = xor i16 %y, %z ret i16 %r } @@ -1124,7 +1124,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %x) %r = xor i16 %y, %z ret i16 %r } @@ -1145,7 +1145,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %x) %r = xor i16 %y, %z ret i16 %r } @@ -1165,7 +1165,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %x) %r = xor i8 %y, %z ret i8 %r } @@ -1187,7 +1187,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %x) %r = xor i8 %y, %z ret i8 %r } @@ -1210,7 +1210,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %x) %r = xor i8 %y, %z ret i8 %r } @@ -1222,7 +1222,7 @@ ; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %x) %r = xor i64 %y, %z ret i64 %r } @@ 
-1240,7 +1240,7 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x) %r = xor i64 %y, %z ret i64 %r } @@ -1259,44 +1259,44 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x) %r = xor i64 %y, %z ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 
@llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) +declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: vadd.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x) + %z = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -30,7 +30,7 @@ ; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x) + %z = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -56,7 +56,7 @@ ; CHECK-NOFP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %y, <8 x float> %x) + %z = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -71,7 +71,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v2f16(half %y, <2 x half> %x) store half %z, half* %yy ret void } @@ -102,7 +102,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -139,7 +139,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -189,7 +189,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half %y, <16 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -200,7 +200,7 @@ ; CHECK-NEXT: vadd.f64 d0, d1, d0 ; 
CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double %y, <1 x double> %x) + %z = call fast double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -211,7 +211,7 @@ ; CHECK-NEXT: vadd.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %y, <2 x double> %x) + %z = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -224,7 +224,7 @@ ; CHECK-NEXT: vadd.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %y, <4 x double> %x) + %z = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x) ret double %z } @@ -235,7 +235,7 @@ ; CHECK-NEXT: vadd.f32 s0, s4, s1 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x) + %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -248,7 +248,7 @@ ; CHECK-NEXT: vadd.f32 s0, s4, s3 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x) + %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -265,7 +265,7 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s7 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %y, <8 x float> %x) + %z = call float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -283,7 +283,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x) + %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -308,7 +308,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x) + %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -345,7 +345,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half %y, <16 x half> %x) + %z = call half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -356,7 +356,7 @@ ; CHECK-NEXT: vadd.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double %y, <1 x double> %x) + %z = call double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -367,7 +367,7 @@ ; CHECK-NEXT: vadd.f64 d0, d2, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %y, <2 x double> %x) + %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -380,17 +380,17 @@ ; CHECK-NEXT: vadd.f64 d0, d0, d3 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %y, <4 x double> %x) + %z = call double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x) ret double %z } -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half, <2 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>) +declare half @llvm.vector.reduce.fadd.f16.v2f16(half, <2 x half>) +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: vminnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z } @@ -27,7 +27,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) ret float %z } @@ -60,7 +60,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) ret float %z } @@ -83,7 +83,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z } @@ -112,7 +112,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) ret half %z } @@ -170,7 +170,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) ret half %z } @@ -179,7 +179,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) ret double %z } @@ -189,7 +189,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) ret double %z 
} @@ -205,7 +205,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) ret double %z } @@ -215,7 +215,7 @@ ; CHECK-NEXT: vminnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z } @@ -234,7 +234,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) ret float %z } @@ -258,7 +258,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) ret float %z } @@ -281,7 +281,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z } @@ -310,7 +310,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) ret half %z } @@ -352,7 +352,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) ret half %z } @@ -361,7 +361,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) ret double %z } @@ -371,7 +371,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) ret double %z } @@ -383,7 +383,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) ret double %z } @@ -394,7 +394,7 @@ ; CHECK-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp fast olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -417,7 +417,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) %c = fcmp fast olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -453,7 +453,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) %c = fcmp fast olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -485,7 +485,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half 
@llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -503,7 +503,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v2f16(<2 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -542,7 +542,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -610,7 +610,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -623,7 +623,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) %c = fcmp fast olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -636,7 +636,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) %c = fcmp fast olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -655,7 +655,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) %c = fcmp fast olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -670,7 +670,7 @@ ; CHECK-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -697,7 +697,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) %c = fcmp olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -729,7 +729,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) %c = fcmp olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -765,7 +765,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) %c = fcmp olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -808,7 +808,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = 
call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) %c = fcmp olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -864,7 +864,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) %c = fcmp olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -879,7 +879,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) %c = fcmp olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -894,7 +894,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) %c = fcmp olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -911,7 +911,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) %c = fcmp olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -923,7 +923,7 @@ ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z } @@ -942,7 +942,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) ret float %z } @@ -974,7 +974,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) ret float %z } @@ -997,7 +997,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z } @@ -1026,7 +1026,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) ret half %z } @@ -1084,7 +1084,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) ret half %z } @@ -1093,7 +1093,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) ret double %z } @@ -1103,7 +1103,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) ret double %z } @@ -1119,7 +1119,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call fast double 
@llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) ret double %z } @@ -1129,7 +1129,7 @@ ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z } @@ -1148,7 +1148,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) ret float %z } @@ -1172,7 +1172,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) ret float %z } @@ -1195,7 +1195,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z } @@ -1224,7 +1224,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) ret half %z } @@ -1266,7 +1266,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) ret half %z } @@ -1275,7 +1275,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) ret double %z } @@ -1285,7 +1285,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) ret double %z } @@ -1297,7 +1297,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) ret double %z } @@ -1308,7 +1308,7 @@ ; CHECK-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp fast ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1331,7 +1331,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) %c = fcmp fast ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1367,7 +1367,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) %c = fcmp fast ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1384,7 +1384,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x) + %z = call fast half 
@llvm.vector.reduce.fmax.v2f16(<2 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1417,7 +1417,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1456,7 +1456,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1524,7 +1524,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1537,7 +1537,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) %c = fcmp fast ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1550,7 +1550,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) %c = fcmp fast ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1569,7 +1569,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) %c = fcmp fast ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1584,7 +1584,7 @@ ; CHECK-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1611,7 +1611,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) %c = fcmp ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1643,7 +1643,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) %c = fcmp ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1679,7 +1679,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) %c = fcmp ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1722,7 +1722,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) 
%c = fcmp ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1778,7 +1778,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) %c = fcmp ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1793,7 +1793,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) %c = fcmp ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1808,7 +1808,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) %c = fcmp ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1825,29 +1825,29 @@ ; CHECK-NEXT: vselgt.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) %c = fcmp ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r } -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare half @llvm.vector.reduce.fmax.v16f16(<16 x 
half>) +declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>) +declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>) +declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: vmul.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %y, <2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -30,7 +30,7 @@ ; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -56,7 +56,7 @@ ; CHECK-NOFP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %y, <8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -71,7 +71,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x) store half %z, half* %yy ret void } @@ -102,7 +102,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -139,7 +139,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -189,7 +189,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half %y, <16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -200,7 +200,7 @@ ; CHECK-NEXT: vmul.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double %y, <1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -211,7 +211,7 @@ ; CHECK-NEXT: vmul.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %y, <2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -224,7 +224,7 @@ ; CHECK-NEXT: vmul.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %y, <4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x) ret double %z } @@ -235,7 +235,7 @@ ; CHECK-NEXT: vmul.f32 s0, s4, s1 
; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %y, <2 x float> %x) + %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -248,7 +248,7 @@ ; CHECK-NEXT: vmul.f32 s0, s4, s3 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x) + %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -265,7 +265,7 @@ ; CHECK-NEXT: vmul.f32 s0, s0, s7 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %y, <8 x float> %x) + %z = call float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -280,7 +280,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x) store half %z, half* %yy ret void } @@ -299,7 +299,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -324,7 +324,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -361,7 +361,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half %y, <16 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -372,7 +372,7 @@ ; CHECK-NEXT: vmul.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double %y, <1 x double> %x) + %z = call double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -383,7 +383,7 @@ ; CHECK-NEXT: vmul.f64 d0, d2, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %y, <2 x double> %x) + %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -396,17 +396,17 @@ ; CHECK-NEXT: vmul.f64 d0, d0, d3 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %y, <4 x double> %x) + %z = call double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x) ret double %z } -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half, <16 x half>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half, <2 x half>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half, <8 x 
half>) +declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>) +declare half @llvm.vector.reduce.fmul.f16.v2f16(half, <2 x half>) +declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -65,7 +65,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %2 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %wide.load) + %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load) %3 = add i32 %2, %vec.phi %index.next = add i32 %index, 4 %4 = icmp eq i32 %index.next, %n.vec @@ -167,7 +167,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -267,7 +267,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -367,7 +367,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -467,7 +467,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -568,7 +568,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2) + %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -665,7 +665,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2) + %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -762,7 +762,7 @@ br i1 %4, label %middle.block, label 
%vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -852,7 +852,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load) %2 = icmp slt i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -958,7 +958,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1048,7 +1048,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load) %2 = icmp sgt i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -1154,7 +1154,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1244,7 +1244,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load) %2 = icmp ult i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -1350,7 +1350,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1440,7 +1440,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load) %2 = icmp ugt i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -1553,7 +1553,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %3) + %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1658,7 +1658,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %3) + %5 = call float 
@llvm.vector.reduce.fmax.v4f32(<4 x float> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1722,7 +1722,7 @@ %1 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer - %3 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %2) + %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) %4 = add i32 %3, %vec.phi %index.next = add i32 %index, 4 %5 = icmp eq i32 %index.next, %n.vec @@ -1777,7 +1777,7 @@ %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer - %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) %7 = add i32 %6, %vec.phi %index.next = add i32 %index, 4 %8 = icmp eq i32 %index.next, %n.vec @@ -1828,7 +1828,7 @@ %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %2 = sext <8 x i16> %wide.masked.load to <8 x i32> %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer - %4 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %3) + %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) %5 = add i32 %4, %vec.phi %index.next = add i32 %index, 8 %6 = icmp eq i32 %index.next, %n.vec @@ -1885,7 +1885,7 @@ %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32> %6 = mul nsw <8 x i32> %5, %2 %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer - %8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7) %9 = add i32 %8, %vec.phi %index.next = add i32 %index, 8 %10 = icmp eq i32 %index.next, %n.vec @@ -1936,7 +1936,7 @@ %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %2 = zext <16 x i8> %wide.masked.load to <16 x i32> %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer - %4 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %3) + %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) %5 = add i32 %4, %vec.phi %index.next = add i32 %index, 16 %6 = icmp eq i32 %index.next, %n.vec @@ -1993,7 +1993,7 @@ %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32> %6 = mul nuw nsw <16 x i32> %5, %2 %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer - %8 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7) %9 = add i32 %8, %vec.phi %index.next = add i32 %index, 16 %10 = icmp eq i32 %index.next, %n.vec @@ -2043,7 +2043,7 @@ %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer - %3 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %2) + %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) %4 = add i16 %3, %vec.phi %index.next = add i32 %index, 8 %5 = 
icmp eq i32 %index.next, %n.vec @@ -2098,7 +2098,7 @@ %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer - %6 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %5) + %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5) %7 = add i16 %6, %vec.phi %index.next = add i32 %index, 8 %8 = icmp eq i32 %index.next, %n.vec @@ -2149,7 +2149,7 @@ %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %2 = zext <16 x i8> %wide.masked.load to <16 x i16> %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer - %4 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %3) + %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3) %5 = add i16 %4, %vec.phi %index.next = add i32 %index, 16 %6 = icmp eq i32 %index.next, %n.vec @@ -2206,7 +2206,7 @@ %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16> %6 = mul nuw <16 x i16> %5, %2 %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer - %8 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %7) + %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7) %9 = add i16 %8, %vec.phi %index.next = add i32 %index, 16 %10 = icmp eq i32 %index.next, %n.vec @@ -2256,7 +2256,7 @@ %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer - %3 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %2) + %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) %4 = add i8 %3, %vec.phi %index.next = add i32 %index, 16 %5 = icmp eq i32 %index.next, %n.vec @@ -2311,7 +2311,7 @@ %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer - %6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %5) + %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5) %7 = add i8 %6, %vec.phi %index.next = add i32 %index, 16 %8 = icmp eq i32 %index.next, %n.vec @@ -2364,7 +2364,7 @@ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %2 = sext <4 x i32> %wide.masked.load to <4 x i64> %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer - %4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %3) + %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3) %5 = add i64 %4, %vec.phi %index.next = add i32 %index, 4 %6 = icmp eq i32 %index.next, %n.vec @@ -2423,7 +2423,7 @@ %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64> %6 = mul nsw <4 x i64> %5, %2 %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer - %8 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %7) + %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7) %9 = add i64 %8, %vec.phi %index.next = add i32 %index, 4 %10 = icmp eq i32 %index.next, %n.vec @@ -2482,7 +2482,7 @@ %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64> %6 = mul 
nsw <8 x i64> %5, %2 %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer - %8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %7) + %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7) %9 = add i64 %8, %vec.phi %index.next = add i32 %index, 8 %10 = icmp eq i32 %index.next, %n.vec @@ -2497,26 +2497,26 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2 -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) #3 +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2 -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) #3 -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #3 -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) #3 -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #3 -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) #3 -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) #3 - -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3 +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3 +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3 +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3 +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3 +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3 + +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <4 x i32> %x, %y - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -21,7 +21,7 @@ %xx = zext <4 x i32> %x to <4 x i64> %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) ret i64 %z } @@ -34,7 +34,7 @@ %xx = sext <4 x i32> %x to <4 x i64> %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) ret i64 %z } @@ -53,7 +53,7 @@ %xx = zext <2 x i32> %x to <2 x i64> %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -72,7 +72,7 @@ %xx = sext <2 x i32> %x to <2 x i64> %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -85,7 +85,7 @@ %xx = zext <8 x i16> %x to <8 x i32> %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m) ret i32 %z } @@ -98,7 +98,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m) ret i32 %z } @@ -113,7 +113,7 @@ %xx = zext <4 x i16> %x to <4 x i32> %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -128,7 +128,7 @@ %xx = sext <4 x i16> %x to <4 x i32> %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -140,7 +140,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <8 x i16> %x, %y - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) ret i16 %z } @@ -153,7 +153,7 @@ %xx = zext <8 x i16> %x to <8 x i64> %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) ret i64 %z } @@ -166,7 +166,7 @@ %xx = sext <8 x i16> %x to <8 x i64> %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) ret i64 %z } @@ -180,7 +180,7 @@ %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) ret i64 %z } @@ -194,7 +194,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> - %z = call 
i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) ret i64 %z } @@ -207,7 +207,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) ret i64 %z } @@ -228,7 +228,7 @@ %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -250,7 +250,7 @@ %xx = sext <2 x i16> %x to <2 x i64> %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -263,7 +263,7 @@ %xx = zext <16 x i8> %x to <16 x i32> %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) ret i32 %z } @@ -276,7 +276,7 @@ %xx = sext <16 x i8> %x to <16 x i32> %yy = sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) ret i32 %z } @@ -290,7 +290,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) ret i32 %z } @@ -304,7 +304,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) ret i32 %z } @@ -317,7 +317,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) ret i32 %z } @@ -333,7 +333,7 @@ %xx = zext <4 x i8> %x to <4 x i32> %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -350,7 +350,7 @@ %xx = sext <4 x i8> %x to <4 x i32> %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -364,7 +364,7 @@ %xx = zext <16 x i8> %x to <16 x i16> %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) ret i16 %z } @@ -378,7 +378,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) ret i16 %z } @@ -394,7 +394,7 @@ %xx = zext <8 x i8> %x to <8 x i16> %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) ret i16 %z } @@ -410,7 +410,7 @@ %xx = sext <8 x i8> %x to <8 x i16> %yy = sext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) ret i16 %z } @@ -422,7 +422,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <16 x i8> %x, %y - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m) ret i8 %z } @@ -636,7 +636,7 @@ %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) ret i64 %z } @@ -803,7 +803,7 @@ %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) ret i64 %z } @@ -826,7 +826,7 @@ %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -848,7 +848,7 @@ %xx = sext <2 x i8> %x to <2 x i64> %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -879,7 +879,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %m = mul <2 x i64> %x, %y - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -890,7 +890,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <4 x i32> %x, %y - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -904,7 +904,7 @@ %xx = zext <4 x i32> %x to <4 x i64> %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -918,7 +918,7 @@ %xx = sext <4 x i32> %x to <4 x i64> %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -942,7 +942,7 @@ %xx = zext <2 x i32> %x to <2 x i64> %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -966,7 +966,7 @@ %xx = sext <2 x i32> %x to <2 x i64> %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -980,7 +980,7 @@ %xx = zext <8 x i16> %x to <8 x i32> %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 
x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -994,7 +994,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1010,7 +1010,7 @@ %xx = zext <4 x i16> %x to <4 x i32> %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1026,7 +1026,7 @@ %xx = sext <4 x i16> %x to <4 x i32> %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1039,7 +1039,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <8 x i16> %x, %y - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1053,7 +1053,7 @@ %xx = zext <8 x i16> %x to <8 x i64> %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1067,7 +1067,7 @@ %xx = sext <8 x i16> %x to <8 x i64> %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1082,7 +1082,7 @@ %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) %r = add i64 %z, %a ret i64 %r } @@ -1097,7 +1097,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) %r = add i64 %z, %a ret i64 %r } @@ -1111,7 +1111,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) %r = add i64 %z, %a ret i64 %r } @@ -1137,7 +1137,7 @@ %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1164,7 +1164,7 @@ %xx = sext <2 x i16> %x to <2 x i64> %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1178,7 +1178,7 @@ %xx = zext <16 x i8> %x to <16 x i32> %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1192,7 +1192,7 @@ %xx = sext <16 x i8> %x to <16 x i32> %yy = 
sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1207,7 +1207,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) %r = add i32 %z, %a ret i32 %r } @@ -1222,7 +1222,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) %r = add i32 %z, %a ret i32 %r } @@ -1236,7 +1236,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) %r = add i32 %z, %a ret i32 %r } @@ -1253,7 +1253,7 @@ %xx = zext <4 x i8> %x to <4 x i32> %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1271,7 +1271,7 @@ %xx = sext <4 x i8> %x to <4 x i32> %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1286,7 +1286,7 @@ %xx = zext <16 x i8> %x to <16 x i16> %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1301,7 +1301,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1318,7 +1318,7 @@ %xx = zext <8 x i8> %x to <8 x i16> %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1335,7 +1335,7 @@ %xx = sext <8 x i8> %x to <8 x i16> %yy = sext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1348,7 +1348,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <16 x i8> %x, %y - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m) %r = add i8 %z, %a ret i8 %r } @@ -1565,7 +1565,7 @@ %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1737,7 +1737,7 @@ %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 
@llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1765,7 +1765,7 @@ %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1792,7 +1792,7 @@ %xx = sext <2 x i8> %x to <2 x i64> %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1826,18 +1826,18 @@ ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %m = mul <2 x i64> %x, %y - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -11,7 +11,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %m = mul <4 x i32> %x, %y %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -27,7 +27,7 @@ %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -43,7 +43,7 @@ %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -79,7 +79,7 @@ %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call 
i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -115,7 +115,7 @@ %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -131,7 +131,7 @@ %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -147,7 +147,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -166,7 +166,7 @@ %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -185,7 +185,7 @@ %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -200,7 +200,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %m = mul <8 x i16> %x, %y %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -216,7 +216,7 @@ %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -232,7 +232,7 @@ %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -249,7 +249,7 @@ %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -266,7 +266,7 @@ %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -282,7 +282,7 @@ %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -334,7 +334,7 @@ %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = 
call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -385,7 +385,7 @@ %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -401,7 +401,7 @@ %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -417,7 +417,7 @@ %yy = sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -434,7 +434,7 @@ %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -451,7 +451,7 @@ %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -467,7 +467,7 @@ %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -487,7 +487,7 @@ %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -509,7 +509,7 @@ %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -526,7 +526,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -543,7 +543,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -563,7 +563,7 @@ %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -583,7 +583,7 @@ %yy = sext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -598,7 +598,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %m = mul <16 x i8> %x, %y %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) ret i8 %z } @@ -1010,7 +1010,7 @@ %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1353,7 +1353,7 @@ %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1405,7 +1405,7 @@ %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1456,7 +1456,7 @@ %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1509,7 +1509,7 @@ %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1523,7 +1523,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %m = mul <4 x i32> %x, %y %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1540,7 +1540,7 @@ %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1557,7 +1557,7 @@ %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1598,7 +1598,7 @@ %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1639,7 +1639,7 @@ %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1656,7 +1656,7 @@ 
%yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1673,7 +1673,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1693,7 +1693,7 @@ %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1713,7 +1713,7 @@ %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1729,7 +1729,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %m = mul <8 x i16> %x, %y %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -1746,7 +1746,7 @@ %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1763,7 +1763,7 @@ %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1781,7 +1781,7 @@ %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1799,7 +1799,7 @@ %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1816,7 +1816,7 @@ %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1873,7 +1873,7 @@ %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1929,7 +1929,7 @@ %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 
x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1946,7 +1946,7 @@ %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1963,7 +1963,7 @@ %yy = sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1981,7 +1981,7 @@ %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1999,7 +1999,7 @@ %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2016,7 +2016,7 @@ %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2037,7 +2037,7 @@ %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2060,7 +2060,7 @@ %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2078,7 +2078,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2096,7 +2096,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2117,7 +2117,7 @@ %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2138,7 +2138,7 @@ %yy = sext <8 x i8> %y to <8 x i16> %m 
= mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2154,7 +2154,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %m = mul <16 x i8> %x, %y %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) %r = add i8 %z, %a ret i8 %r } @@ -2569,7 +2569,7 @@ %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2917,7 +2917,7 @@ %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2974,7 +2974,7 @@ %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -3030,7 +3030,7 @@ %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -3088,18 +3088,18 @@ %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %x) ret i32 %z } @@ -25,7 +25,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %x) ret i32 %z } @@ -42,7 +42,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %x) ret i32 %z } @@ -58,7 +58,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %x) ret i16 %z } @@ -76,7 +76,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %x) ret i16 %z } @@ -95,7 +95,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %x) ret i16 %z } @@ -113,7 +113,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %x) ret i8 %z } @@ -133,7 +133,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %x) ret i8 %z } @@ -154,7 +154,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %x) ret i8 %z } @@ -163,7 +163,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %x) ret i64 %z } @@ -179,7 +179,7 @@ ; CHECK-NEXT: mla r1, r3, r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) ret i64 %z } @@ -207,7 +207,7 @@ ; CHECK-NEXT: mla r1, r1, r6, r4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) ret i64 %z } @@ -220,7 +220,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %x) %r = mul i32 %y, %z ret i32 %r } @@ -238,7 +238,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %x) %r = mul i32 %y, %z ret i32 %r } @@ -257,7 +257,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %x) %r = mul i32 %y, %z ret i32 %r } @@ -275,7 +275,7 @@ ; CHECK-NEXT: muls r0, 
r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %x) %r = mul i16 %y, %z ret i16 %r } @@ -295,7 +295,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %x) %r = mul i16 %y, %z ret i16 %r } @@ -316,7 +316,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %x) %r = mul i16 %y, %z ret i16 %r } @@ -336,7 +336,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %x) %r = mul i8 %y, %z ret i8 %r } @@ -358,7 +358,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %x) %r = mul i8 %y, %z ret i8 %r } @@ -381,7 +381,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %x) %r = mul i8 %y, %z ret i8 %r } @@ -397,7 +397,7 @@ ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %x) %r = mul i64 %y, %z ret i64 %r } @@ -420,7 +420,7 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r4, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) %r = mul i64 %y, %z ret i64 %r } @@ -453,20 +453,20 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) %r = mul i64 %y, %z ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) -declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) +declare i8 
@llvm.vector.reduce.mul.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8(<16 x i8> %s1) { ; CHECK-LABEL: vmaxv_s_v16i8: @@ -20,7 +20,7 @@ ; CHECK-NEXT: mvn r0, #127 ; CHECK-NEXT: vmaxv.s8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1) ret i8 %r } @@ -31,7 +31,7 @@ ; CHECK-NEXT: movt r0, #65535 ; CHECK-NEXT: vmaxv.s16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1) ret i16 %r } @@ -41,7 +41,7 @@ ; CHECK-NEXT: mov.w r0, #-2147483648 ; CHECK-NEXT: vmaxv.s32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %s1) ret i32 %r } @@ -51,7 +51,7 @@ ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmaxv.u8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1) ret i8 %r } @@ -61,7 +61,7 @@ ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmaxv.u16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1) ret i16 %r } @@ -71,7 +71,7 @@ ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmaxv.u32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %s1) ret i32 %r } @@ -81,7 +81,7 @@ ; CHECK-NEXT: movs r0, #127 ; CHECK-NEXT: 
vminv.s8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1) ret i8 %r } @@ -91,7 +91,7 @@ ; CHECK-NEXT: movw r0, #32767 ; CHECK-NEXT: vminv.s16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1) ret i16 %r } @@ -101,7 +101,7 @@ ; CHECK-NEXT: mvn r0, #-2147483648 ; CHECK-NEXT: vminv.s32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %s1) ret i32 %r } @@ -111,7 +111,7 @@ ; CHECK-NEXT: movs r0, #255 ; CHECK-NEXT: vminv.u8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1) ret i8 %r } @@ -121,7 +121,7 @@ ; CHECK-NEXT: movw r0, #65535 ; CHECK-NEXT: vminv.u16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1) ret i16 %r } @@ -131,7 +131,7 @@ ; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: vminv.u32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %s1) ret i32 %r } @@ -142,7 +142,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmaxv.s8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1) %c = icmp sgt i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -157,7 +157,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1) %rs = sext i8 %r to i32 %c = icmp sgt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -169,7 +169,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmaxv.s16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1) %c = icmp sgt i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -185,7 +185,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1) %rs = sext i16 %r to i32 %c = icmp sgt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -197,7 +197,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmaxv.s32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %s1) %c = icmp sgt i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s @@ -208,7 +208,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmaxv.u8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1) %c = icmp ugt i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -223,7 +223,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1) %rs = zext i8 %r to i32 %c = icmp ugt i32 %rs, %s2 %s = select 
i1 %c, i32 %rs, i32 %s2 @@ -235,7 +235,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmaxv.u16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1) %c = icmp ugt i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -250,7 +250,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1) %rs = zext i16 %r to i32 %c = icmp ugt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -262,7 +262,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmaxv.u32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %s1) %c = icmp ugt i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s @@ -273,7 +273,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vminv.s8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1) %c = icmp slt i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -288,7 +288,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1) %rs = sext i8 %r to i32 %c = icmp slt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -300,7 +300,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vminv.s16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1) %c = icmp slt i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -315,7 +315,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1) %rs = sext i16 %r to i32 %c = icmp slt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -327,7 +327,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vminv.s32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %s1) %c = icmp slt i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s @@ -338,7 +338,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vminv.u8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1) %c = icmp ult i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -353,7 +353,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1) %rs = zext i8 %r to i32 %c = icmp ult i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -365,7 +365,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vminv.u16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1) %c = icmp ult i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -380,7 +380,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> 
%s1) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1) %rs = zext i16 %r to i32 %c = icmp ult i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -392,7 +392,7 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vminv.u32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %s1) %c = icmp ult i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1628,8 +1628,8 @@ ; Repeat tests from general reductions to verify output for hoppy targets: ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: @@ -1672,7 +1672,7 @@ ; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1711,7 +1711,7 @@ ; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) define i1 @parseHeaders(i64 * %ptr) nounwind { ; SSE2-LABEL: parseHeaders: @@ -34,7 +34,7 @@ ; AVX-NEXT: retq %vptr = bitcast i64 * %ptr to <2 x i64> * %vload = load <2 x i64>, <2 x i64> * %vptr, align 8 - %vreduce = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %vload) + %vreduce = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %vload) %vcheck = icmp eq i64 %vreduce, 0 ret i1 %vcheck } diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -32,7 +32,7 @@ ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -74,7 +74,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -124,7 +124,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -187,7 +187,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -229,7 +229,7 @@ ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -276,7 +276,7 @@ ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -336,7 +336,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -408,7 +408,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -499,7 +499,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -547,7 +547,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -601,7 +601,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -663,7 +663,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -738,7 +738,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -826,7 +826,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -933,7 +933,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -966,7 +966,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) + %1 = call i8 
@llvm.vector.reduce.add.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1002,7 +1002,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %a0 = load <2 x i8>, <2 x i8>* %p - %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1043,7 +1043,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1075,7 +1075,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %a0 = load <4 x i8>, <4 x i8>* %p - %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1103,7 +1103,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1135,7 +1135,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %a0 = load <8 x i8>, <8 x i8>* %p - %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1169,7 +1169,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1223,7 +1223,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1285,7 +1285,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1360,32 +1360,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 
@llvm.vector.reduce.add.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -59,7 +59,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b } @@ -111,7 +111,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -176,7 +176,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -205,7 +205,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -262,7 +262,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -351,7 +351,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -420,7 +420,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -492,7 +492,7 
@@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -597,7 +597,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i64> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -678,7 +678,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = trunc <16 x i32> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -765,7 +765,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -845,7 +845,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b } @@ -905,7 +905,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b } @@ -961,7 +961,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -1014,7 +1014,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = icmp eq <8 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -1062,7 +1062,7 @@ ; AVX512VL-NEXT: setb %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -1141,7 +1141,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -1207,7 +1207,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -1274,7 +1274,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -1354,7 +1354,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -1447,7 +1447,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -1507,7 +1507,7 @@ ; AVX512-NEXT: 
vzeroupper ; AVX512-NEXT: retq %a = icmp eq <16 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -1595,7 +1595,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -1686,13 +1686,13 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b } -declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -28,7 +28,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -79,7 +79,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -136,7 +136,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -202,7 +202,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -229,7 +229,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -256,7 +256,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -315,7 +315,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -380,7 +380,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: 
retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -454,7 +454,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -482,7 +482,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -510,7 +510,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -542,7 +542,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -610,7 +610,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -684,7 +684,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -767,7 +767,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -795,7 +795,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -824,7 +824,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -857,7 +857,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -894,7 +894,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -971,7 +971,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -1054,7 +1054,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -1146,33 +1146,33 @@ ; 
AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -24,7 +24,7 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -66,7 +66,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0) + %1 = call i64 
@llvm.vector.reduce.and.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -193,7 +193,7 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -215,7 +215,7 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -321,7 +321,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -411,7 +411,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -465,7 +465,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -528,7 +528,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -597,7 +597,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -675,7 +675,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -700,7 +700,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call 
i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -726,7 +726,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -756,7 +756,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -790,7 +790,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -862,7 +862,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -940,7 +940,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1027,32 +1027,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) 
-declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -53,7 +53,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -112,7 +112,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -185,7 +185,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -268,7 +268,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -313,7 +313,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -367,7 +367,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -435,7 +435,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -513,7 +513,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float 
@llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -558,7 +558,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -612,7 +612,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -680,7 +680,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -801,7 +801,7 @@ ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -853,7 +853,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -912,7 +912,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -982,7 +982,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1021,7 +1021,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1069,7 +1069,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1124,7 +1124,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> 
%a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1189,7 +1189,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -1228,7 +1228,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1276,7 +1276,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1331,7 +1331,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1396,16 +1396,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 
} @@ -90,7 +90,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -176,7 +176,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -327,7 +327,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -367,7 +367,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -422,7 +422,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -512,7 +512,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -667,7 +667,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -699,7 +699,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -746,7 +746,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -975,7 +975,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -1004,7 +1004,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = 
xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1042,7 +1042,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1101,7 +1101,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1229,7 +1229,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1261,7 +1261,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1302,7 +1302,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1364,7 +1364,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1467,7 +1467,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -1493,7 +1493,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double undef, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } @@ -1528,7 +1528,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double undef, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0) ret double %1 } @@ -1584,7 +1584,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double undef, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0) ret double %1 } @@ -1681,16 +1681,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double undef, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } @@ -78,7 +78,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 } @@ -125,7 +125,7 @@ ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -179,7 +179,7 @@ ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -206,7 +206,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } @@ -236,7 +236,7 @@ ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0) ret double %1 } @@ -271,7 +271,7 @@ ; AVX512-NEXT: vminsd 
%xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } @@ -313,16 +313,16 @@ ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -35,7 +35,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 } @@ -84,7 +84,7 @@ ; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 } @@ -155,7 +155,7 @@ ; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -247,7 +247,7 @@ ; AVX512-NEXT: vmaxss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -274,7 +274,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0) ret double %1 } @@ -316,7 +316,7 @@ ; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v3f64(<3 x double> 
%a0) ret double %1 } @@ -350,7 +350,7 @@ ; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0) ret double %1 } @@ -395,7 +395,7 @@ ; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } @@ -447,7 +447,7 @@ ; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } @@ -511,18 +511,18 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call nnan half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %a0) + %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0) ret half %1 } -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v3f64(<3 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) -declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -14,7 +14,7 @@ ; ALL-LABEL: test_v1f32: ; ALL: # %bb.0: ; ALL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a0) ret float %1 } @@ -62,7 +62,7 @@ ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 } @@ -133,7 +133,7 @@ ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a0) ret float %1 } @@ -230,7 +230,7 @@ ; AVX512-NEXT: vmaxss 
%xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 } @@ -401,7 +401,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -661,7 +661,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -700,7 +700,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0) ret double %1 } @@ -774,7 +774,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0) ret double %1 } @@ -922,7 +922,7 @@ ; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } @@ -1091,18 +1091,18 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v3f32(<3 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -14,7 +14,7 @@ ; ALL-LABEL: test_v1f32: ; 
ALL: # %bb.0: ; ALL-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a0) ret float %1 } @@ -43,7 +43,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } @@ -84,7 +84,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a0) ret float %1 } @@ -133,7 +133,7 @@ ; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a0) ret float %1 } @@ -204,7 +204,7 @@ ; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -296,7 +296,7 @@ ; AVX512-NEXT: vminss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0) ret float %1 } @@ -323,7 +323,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } @@ -357,7 +357,7 @@ ; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0) ret double %1 } @@ -402,7 +402,7 @@ ; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } @@ -454,7 +454,7 @@ ; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a0) ret double %1 } @@ -518,20 +518,20 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call nnan half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %a0) + %1 = call nnan half @llvm.vector.reduce.fmin.v2f16(<2 x half> %a0) ret half %1 } -declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) 
+declare float @llvm.vector.reduce.fmin.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v3f32(<3 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) -declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -54,7 +54,7 @@ ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } @@ -151,7 +151,7 @@ ; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a0) ret float %1 } @@ -322,7 +322,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -582,7 +582,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0) ret float %1 } @@ -621,7 +621,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v3f64(<3 x double> %a0) ret double %1 } @@ -765,7 +765,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0) ret double %1 } @@ -913,7 +913,7 @@ ; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } @@ -1082,17 +1082,17 @@ ; AVX512-NEXT: vmovsd %xmm2, 
%xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v3f64(<3 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -82,7 +82,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -133,7 +133,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -191,7 +191,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -225,7 +225,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call fast float 
@llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -313,7 +313,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -368,7 +368,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -402,7 +402,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -442,7 +442,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -490,7 +490,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -545,7 +545,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -575,7 +575,7 @@ ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -608,7 +608,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -646,7 +646,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -719,7 +719,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call 
fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -750,7 +750,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -786,7 +786,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } @@ -856,7 +856,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -887,7 +887,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -923,7 +923,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -965,16 +965,16 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x 
double>) +declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -38,7 +38,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -89,7 +89,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -326,7 +326,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -360,7 +360,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -407,7 +407,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -489,7 +489,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -636,7 +636,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -668,7 +668,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -715,7 +715,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -797,7 +797,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; 
AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -944,7 +944,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -973,7 +973,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1011,7 +1011,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1070,7 +1070,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1198,7 +1198,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1226,7 +1226,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -1263,7 +1263,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -1321,7 +1321,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -1419,7 +1419,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } @@ -1445,7 +1445,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double undef, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } @@ -1480,7 +1480,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call 
double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double undef, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0) ret double %1 } @@ -1536,7 +1536,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double undef, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0) ret double %1 } @@ -1633,16 +1633,16 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double undef, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -85,7 +85,7 @@ ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -231,7 +231,7 @@ ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -443,7 +443,7 @@ ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -763,7 +763,7 @@ ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -799,7 +799,7 @@ ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0) + %1 
= call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -841,7 +841,7 @@ ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -905,7 +905,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -983,7 +983,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1086,7 +1086,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1119,7 +1119,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1154,7 +1154,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1195,7 +1195,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1258,7 +1258,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1380,7 +1380,7 @@ ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1522,7 +1522,7 @@ ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1555,7 +1555,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1607,7 +1607,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1668,7 +1668,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> 
%a0) ret i8 %1 } @@ -1842,7 +1842,7 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -2051,7 +2051,7 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -2325,7 +2325,7 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2685,32 +2685,32 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>) +declare i8 
@llvm.vector.reduce.mul.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -57,7 +57,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) ret i1 %b } @@ -107,7 +107,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -169,7 +169,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -198,7 +198,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -253,7 +253,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -338,7 +338,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -407,7 +407,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -479,7 +479,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -580,7 +580,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i64> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -661,7 +661,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = trunc <16 x i32> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -748,7 +748,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -828,7 +828,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a) ret i1 %b } @@ -894,7 +894,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer - %b = call i1 
@llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) ret i1 %b } @@ -945,7 +945,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -998,7 +998,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <8 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -1046,7 +1046,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -1129,7 +1129,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -1197,7 +1197,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -1264,7 +1264,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -1341,7 +1341,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -1441,7 +1441,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -1499,7 +1499,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = icmp eq <16 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -1587,7 +1587,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -1680,13 +1680,13 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a) ret i1 %b } -declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) 
+declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -31,7 +31,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -60,7 +60,7 @@ ; AVX-NEXT: setne %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -111,7 +111,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -198,7 +198,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -224,7 +224,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -253,7 +253,7 @@ ; AVX-NEXT: sete %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -304,7 +304,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -368,7 +368,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -391,7 +391,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -410,7 +410,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -465,7 +465,7 @@ ; AVX-NEXT: setne %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x 
i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -516,7 +516,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -580,7 +580,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -603,7 +603,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -622,7 +622,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -641,7 +641,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -667,7 +667,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -696,7 +696,7 @@ ; AVX-NEXT: sete %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -747,7 +747,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -811,7 +811,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -841,7 +841,7 @@ ; AVX-NEXT: vptest {{.*}}(%rip), %xmm0 ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) %2 = trunc i64 %1 to i16 %3 = icmp eq i16 %2, 0 ret i1 %3 @@ -888,7 +888,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = and i32 %1, 2147483648 %3 = icmp eq i32 %2, 0 ret i1 %3 @@ -935,7 +935,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = trunc i16 %1 to i8 %3 = icmp ne i8 %2, 0 ret i1 %3 @@ -1003,7 +1003,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0) %2 = and i8 %1, 1 %3 = icmp eq i8 %2, 0 ret i1 %3 @@ -1037,34 +1037,34 @@ ; AVX-NEXT: retq %2 = bitcast %struct.Box* %0 to <4 x i32>* %3 = 
load <4 x i32>, <4 x i32>* %2, align 4 - %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %3) + %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %3) %5 = and i32 %4, 15 %6 = icmp eq i32 %5, 0 ret i1 %6 } -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -24,7 +24,7 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -66,7 +66,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -114,7 +114,7 @@ ; 
AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -193,7 +193,7 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -215,7 +215,7 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -321,7 +321,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -411,7 +411,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -465,7 +465,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -528,7 +528,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -597,7 +597,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -675,7 +675,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -700,7 +700,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v2i8(<2 x 
i8> %a0) ret i8 %1 } @@ -726,7 +726,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -756,7 +756,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -790,7 +790,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -862,7 +862,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -940,7 +940,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1027,32 +1027,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 
@llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -83,7 +83,7 @@ ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -209,7 +209,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -404,7 +404,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -731,7 +731,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -771,7 +771,7 @@ ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -819,7 +819,7 @@ ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -891,7 +891,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -981,7 +981,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1104,7 +1104,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1137,7 +1137,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1172,7 +1172,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # 
kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1216,7 +1216,7 @@ ; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1280,7 +1280,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1354,7 +1354,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1445,7 +1445,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1491,7 +1491,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1547,7 +1547,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1615,7 +1615,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1685,7 +1685,7 @@ ; AVX512-NEXT: xorb $127, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1781,7 +1781,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1895,7 +1895,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2042,32 +2042,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x 
i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -83,7 +83,7 @@ ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -209,7 +209,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -404,7 +404,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -731,7 +731,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %a0) + %1 
= call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -771,7 +771,7 @@ ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -819,7 +819,7 @@ ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -891,7 +891,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -981,7 +981,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1104,7 +1104,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1137,7 +1137,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1172,7 +1172,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1216,7 +1216,7 @@ ; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1280,7 +1280,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1354,7 +1354,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1445,7 +1445,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1491,7 +1491,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1547,7 +1547,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> %a0) + %1 = call i8 
@llvm.vector.reduce.smin.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1615,7 +1615,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1685,7 +1685,7 @@ ; AVX512-NEXT: xorb $-128, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1781,7 +1781,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1895,7 +1895,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2042,32 +2042,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare 
i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -89,7 +89,7 @@ ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -231,7 +231,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -453,7 +453,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -832,7 +832,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -875,7 +875,7 @@ ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -929,7 +929,7 @@ ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -1010,7 +1010,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -1115,7 +1115,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1265,7 +1265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1311,7 +1311,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1361,7 +1361,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 
@llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1422,7 +1422,7 @@ ; AVX512VL-NEXT: notl %eax ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1505,7 +1505,7 @@ ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1602,7 +1602,7 @@ ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1721,7 +1721,7 @@ ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1754,7 +1754,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1790,7 +1790,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1832,7 +1832,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1901,7 +1901,7 @@ ; AVX512VL-NEXT: notb %al ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1994,7 +1994,7 @@ ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -2099,7 +2099,7 @@ ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2222,32 +2222,32 @@ ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x 
i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -89,7 +89,7 @@ ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -232,7 +232,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -456,7 +456,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -836,7 +836,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> 
%a0) ret i64 %1 } @@ -879,7 +879,7 @@ ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -933,7 +933,7 @@ ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -1014,7 +1014,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -1119,7 +1119,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1269,7 +1269,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1315,7 +1315,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1365,7 +1365,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1407,7 +1407,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1467,7 +1467,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1539,7 +1539,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1632,7 +1632,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1665,7 +1665,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1701,7 +1701,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1743,7 +1743,7 @@ ; 
AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1791,7 +1791,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1859,7 +1859,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1937,7 +1937,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2032,32 +2032,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 
@llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -57,7 +57,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a) ret i1 %b } @@ -107,7 +107,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -172,7 +172,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -201,7 +201,7 @@ ; AVX512-NEXT: setnp %al ; AVX512-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -256,7 +256,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -345,7 +345,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -423,7 +423,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -504,7 +504,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -609,7 +609,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i64> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -717,7 +717,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i32> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -815,7 +815,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -912,7 +912,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> - %b = call i1 
@llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> %a) ret i1 %b } @@ -978,7 +978,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a) ret i1 %b } @@ -1029,7 +1029,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -1082,7 +1082,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <8 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -1136,7 +1136,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -1219,7 +1219,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -1288,7 +1288,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -1366,7 +1366,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -1454,7 +1454,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -1557,7 +1557,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -1643,7 +1643,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -1743,7 +1743,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -1853,13 +1853,13 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> %a) ret i1 %b } -declare i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1>) -declare i1 
@llvm.experimental.vector.reduce.xor.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -24,7 +24,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -66,7 +66,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -193,7 +193,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -215,7 +215,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -321,7 +321,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -411,7 +411,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -465,7 +465,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %a0) + %1 = call i16 
@llvm.vector.reduce.xor.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -528,7 +528,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -597,7 +597,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -675,7 +675,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -700,7 +700,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -726,7 +726,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -756,7 +756,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -790,7 +790,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -862,7 +862,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -940,7 +940,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1027,32 +1027,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>) +declare 
i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
-declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
-declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
rename from llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll
rename to llvm/test/Instrumentation/MemorySanitizer/reduce.ll
--- a/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
@@ -5,9 +5,9 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare i32 @llvm.experimental.vector.reduce.add(<3 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and(<3 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or(<3 x i32>)
+declare i32 @llvm.vector.reduce.add(<3 x i32>)
+declare i32 @llvm.vector.reduce.and(<3 x i32>)
+declare i32 @llvm.vector.reduce.or(<3 x i32>)
; CHECK-LABEL: @reduce_add
define i32 @reduce_add() sanitize_memory {
@@ -17,9 +17,9 @@
%o = load <3 x i32>, <3 x i32> *%p
; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, <3 x i32>*
; CHECK: [[O_ORIGIN:%.*]] = load i32, i32*
-; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
-; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> [[O]])
- %r = call i32 @llvm.experimental.vector.reduce.add(<3 x i32> %o)
+; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[O]])
+ %r = call i32 @llvm.vector.reduce.add(<3 x i32> %o)
; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
; CHECK: ret i32 [[R]]
@@ -35,11 +35,11 @@
; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, <3 x i32>*
; CHECK: [[O_ORIGIN:%.*]] = load i32, i32*
; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[O]], [[O_SHADOW]]
-; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
-; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
+; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
+; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]]
-; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O]])
- %r = call i32 @llvm.experimental.vector.reduce.and(<3 x i32> %o)
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O]])
+ %r = call i32 @llvm.vector.reduce.and(<3 x i32> %o)
; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
; CHECK: ret i32 [[R]]
@@ -56,11 +56,11 @@
; CHECK: [[O_ORIGIN:%.*]] = load i32, i32*
; CHECK: [[NOT_O:%.*]] = xor <3 x i32> [[O]],
; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[NOT_O]], [[O_SHADOW]]
-; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
-; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
+; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
+; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]]
-; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O]])
- %r = call i32 @llvm.experimental.vector.reduce.or(<3 x i32> %o)
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O]])
+ %r = call i32 @llvm.vector.reduce.or(<3 x i32> %o)
; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
; CHECK: ret i32 [[R]]
diff --git a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll
--- a/llvm/test/Transforms/InstCombine/vector-reductions.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll
@@ -1,23 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare void @use_f32(float)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare void @use_i32(i32)
define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]]) ; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r0 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) %r = fsub reassoc nsz float %r0, %r1 ret float %r } @@ -26,13 +26,13 @@ define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_v4f32_fmf( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = fsub nnan ninf nsz float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) %r = fsub ninf nnan nsz float %r0, %r1 ret float %r } @@ -41,15 +41,15 @@ define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_extra_use1( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) ; CHECK-NEXT: call void @use_f32(float [[R0]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) call void @use_f32(float %r0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) %r = fsub fast float %r0, %r1 ret float %r } @@ -58,14 +58,14 @@ define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_extra_use2( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) ; CHECK-NEXT: call void @use_f32(float [[R1]]) ; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) call void @use_f32(float %r1) %r = fsub fast float %r0, %r1 ret float %r @@ -75,13 +75,13 @@ define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_type_mismatch( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a1, <8 x float> %v1) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a1, <8 x float> %v1) %r = fsub fast float %r0, %r1 ret float %r } @@ -89,11 +89,11 @@ define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) %r = sub i32 %r0, %r1 ret i32 %r } @@ -102,15 +102,15 @@ define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use1( -; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) ; CHECK-NEXT: call void @use_i32(i32 [[R0]]) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]] ; CHECK-NEXT: ret i32 [[R]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0) + %r0 = 
call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0) call void @use_i32(i32 %r0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) %r = sub i32 %r0, %r1 ret i32 %r } @@ -119,14 +119,14 @@ define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use2( -; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) ; CHECK-NEXT: call void @use_i32(i32 [[R1]]) ; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]] ; CHECK-NEXT: ret i32 [[R]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) call void @use_i32(i32 %r1) %r = sub i32 %r0, %r1 ret i32 %r @@ -136,13 +136,13 @@ define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_type_mismatch2( -; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]] ; CHECK-NEXT: ret i32 [[R]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %v0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v0) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) %r = sub i32 %r0, %r1 ret i32 %r } diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -1,31 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s -declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a) -declare i32 
@llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a) define i32 @add_0() { ; CHECK-LABEL: @add_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -33,7 +33,7 @@ ; CHECK-LABEL: @add_1( ; CHECK-NEXT: ret i32 8 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ret i32 %x } @@ -41,7 +41,7 @@ ; CHECK-LABEL: @add_inc( ; CHECK-NEXT: ret i32 18 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ret i32 %x } @@ -49,25 +49,25 @@ ; CHECK-LABEL: @add_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> ) ret i32 %x } define i32 @add_undef() { ; CHECK-LABEL: @add_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ret i32 %x } define i32 @add_undef1() { ; CHECK-LABEL: @add_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ret i32 %x } @@ -77,7 +77,7 @@ ; CHECK-LABEL: @mul_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -85,7 +85,7 @@ ; CHECK-LABEL: @mul_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ret i32 %x } @@ -93,7 +93,7 @@ ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 ; - %x = call i32 
@llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ret i32 %x } @@ -101,25 +101,25 @@ ; CHECK-LABEL: @mul_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> ) ret i32 %x } define i32 @mul_undef() { ; CHECK-LABEL: @mul_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) ret i32 %x } define i32 @mul_undef1() { ; CHECK-LABEL: @mul_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ret i32 %x } @@ -128,7 +128,7 @@ ; CHECK-LABEL: @and_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -136,7 +136,7 @@ ; CHECK-LABEL: @and_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ret i32 %x } @@ -144,7 +144,7 @@ ; CHECK-LABEL: @and_inc( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ret i32 %x } @@ -152,25 +152,25 @@ ; CHECK-LABEL: @and_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> ) ret i32 %x } define i32 @and_undef() { ; CHECK-LABEL: @and_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) ret i32 %x } define i32 @and_undef1() { ; CHECK-LABEL: @and_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ret i32 %x } @@ -179,7 +179,7 @@ ; CHECK-LABEL: @or_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -187,7 +187,7 @@ ; CHECK-LABEL: @or_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ret i32 %x } @@ -195,7 +195,7 @@ ; CHECK-LABEL: @or_inc( ; CHECK-NEXT: ret i32 -1 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ret i32 %x } @@ -203,25 +203,25 @@ ; CHECK-LABEL: @or_1v( ; 
CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> ) ret i32 %x } define i32 @or_undef() { ; CHECK-LABEL: @or_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) ret i32 %x } define i32 @or_undef1() { ; CHECK-LABEL: @or_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ret i32 %x } @@ -230,7 +230,7 @@ ; CHECK-LABEL: @xor_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -238,7 +238,7 @@ ; CHECK-LABEL: @xor_1( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ret i32 %x } @@ -246,7 +246,7 @@ ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ret i32 %x } @@ -254,25 +254,25 @@ ; CHECK-LABEL: @xor_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> ) ret i32 %x } define i32 @xor_undef() { ; CHECK-LABEL: @xor_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) ret i32 %x } define i32 @xor_undef1() { ; CHECK-LABEL: @xor_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ret i32 %x } @@ -281,7 +281,7 @@ ; CHECK-LABEL: @smin_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -289,7 +289,7 @@ ; CHECK-LABEL: @smin_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ret i32 %x } @@ -297,7 +297,7 @@ ; CHECK-LABEL: @smin_inc( ; CHECK-NEXT: ret i32 -6 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ret i32 %x } @@ -305,25 +305,25 @@ ; CHECK-LABEL: @smin_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> ) ret i32 %x } define 
i32 @smin_undef() { ; CHECK-LABEL: @smin_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ret i32 %x } define i32 @smin_undef1() { ; CHECK-LABEL: @smin_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ret i32 %x } @@ -332,7 +332,7 @@ ; CHECK-LABEL: @smax_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -340,7 +340,7 @@ ; CHECK-LABEL: @smax_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ret i32 %x } @@ -348,7 +348,7 @@ ; CHECK-LABEL: @smax_inc( ; CHECK-NEXT: ret i32 8 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ret i32 %x } @@ -356,25 +356,25 @@ ; CHECK-LABEL: @smax_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> ) ret i32 %x } define i32 @smax_undef() { ; CHECK-LABEL: @smax_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ret i32 %x } define i32 @smax_undef1() { ; CHECK-LABEL: @smax_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ret i32 %x } @@ -383,7 +383,7 @@ ; CHECK-LABEL: @umin_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -391,7 +391,7 @@ ; CHECK-LABEL: @umin_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ret i32 %x } @@ -399,7 +399,7 @@ ; CHECK-LABEL: @umin_inc( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ret i32 %x } @@ -407,25 +407,25 @@ ; CHECK-LABEL: @umin_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> ) ret i32 %x } define i32 @umin_undef() { ; CHECK-LABEL: @umin_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 
@llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ret i32 %x } define i32 @umin_undef1() { ; CHECK-LABEL: @umin_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ret i32 %x } @@ -434,7 +434,7 @@ ; CHECK-LABEL: @umax_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -442,7 +442,7 @@ ; CHECK-LABEL: @umax_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ret i32 %x } @@ -450,7 +450,7 @@ ; CHECK-LABEL: @umax_inc( ; CHECK-NEXT: ret i32 -3 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ret i32 %x } @@ -458,24 +458,24 @@ ; CHECK-LABEL: @umax_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> ) ret i32 %x } define i32 @umax_undef() { ; CHECK-LABEL: @umax_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ret i32 %x } define i32 @umax_undef1d() { ; CHECK-LABEL: @umax_undef1d( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ret i32 %x } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -8,7 +8,7 @@ ; Function Attrs: norecurse nounwind readonly define i32 @fn1() local_unnamed_addr #0 { ; Ensure that we don't emit reduction intrinsics for unsupported short reductions. 
-; CHECK-NOT: @llvm.experimental.vector.reduce
+; CHECK-NOT: @llvm.vector.reduce
entry:
%0 = load i32, i32* @b, align 4, !tbaa !1
%cmp40 = icmp sgt i32 %0, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -20,7 +20,7 @@
; CHECK: add <16 x i8>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8>
; CHECK: zext i8 [[Rdx]] to i32
;
define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
@@ -75,7 +75,7 @@
; CHECK: add <8 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
@@ -132,7 +132,7 @@
; CHECK: add <8 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -628,7 +628,7 @@
ret void
}
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -23,7 +23,7 @@
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -77,7 +77,7 @@
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]],
label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -132,7 +132,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -186,7 +186,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -240,7 +240,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -294,7 +294,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -348,7 +348,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -402,7 +402,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -451,7 +451,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -508,7 +508,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -41,7 +41,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -129,7 +129,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP11]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -210,7 +210,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -281,7 +281,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 
@llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -352,7 +352,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -423,7 +423,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -494,7 +494,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -565,7 +565,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -636,7 +636,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -703,7 +703,7 @@ ; CHECK-NEXT: 
[[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -773,7 +773,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -843,7 +843,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -913,7 +913,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -183,7 +183,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] @@ -232,7 +232,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label 
[[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -282,7 +282,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -330,7 +330,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] @@ -379,7 +379,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -427,7 +427,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] @@ -664,7 +664,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] @@ -720,7 +720,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 
[[TMP10]], [[MIDDLE_BLOCK]] ] @@ -778,7 +778,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] @@ -834,7 +834,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] @@ -890,7 +890,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] @@ -946,7 +946,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll @@ -29,8 +29,8 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll --- 
a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -595,7 +595,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -665,7 +665,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -735,7 +735,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -805,7 +805,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -110,7 +110,7 @@ ; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; AVX: middle.block: -; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[PREDPHI]]) +; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[PREDPHI]]) ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- 
a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -59,7 +59,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <16 x i32> [[TMP13]], [[BIN_RDX11]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -97,7 +97,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] ; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX19]] -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -263,7 +263,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -450,7 +450,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]] ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]] -; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -776,7 +776,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -953,7 +953,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> 
[[TMP84]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1126,7 +1126,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1477,7 +1477,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]] ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1644,7 +1644,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1811,7 +1811,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1978,7 +1978,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -90,7 
+90,7 @@ ; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP27]], [[TMP26]] -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP7]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll @@ -27,7 +27,7 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]]) ; CHECK-NEXT: ret i8 [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -76,7 +76,7 @@ ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -148,7 +148,7 @@ ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -220,7 +220,7 @@ ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll 
b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -62,7 +62,7 @@ ; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 ; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -195,7 +195,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP15]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -27,13 +27,13 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]]) ; CHECK-NEXT: [[TMP13]] = add i32 [[TMP12]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]]) ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -15,7 +15,7 @@ ; 
CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -65,11 +65,11 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -121,7 +121,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], 12 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 @@ -173,11 +173,11 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP9]] = mul i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -234,9 +234,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to 
<4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP8]] = add i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -291,9 +291,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = mul i32 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -346,7 +346,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -398,9 +398,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = and i32 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -453,7 +453,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 
@llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = or i32 [[TMP5]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -506,7 +506,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = xor i32 [[TMP5]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -558,9 +558,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = fadd float [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -612,9 +612,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = fmul float [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -663,7 +663,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], 4 @@ -711,7 +711,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 @@ -765,7 +765,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -827,7 +827,7 @@ ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -950,11 +950,11 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -1012,7 +1012,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !36 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -1059,7 +1059,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* 
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = and i32 [[TMP5]], [[TMP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -10,7 +10,7 @@ ; CHECK: [[TMP24:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP23:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]] ; CHECK: middle.block: -; CHECK: [[TMP27:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) +; CHECK: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; entry: br label %.lr.ph @@ -38,7 +38,7 @@ ; CHECK: [[TMP46:%.*]] = add <4 x i32> [[TMP45]], [[TMP43:%.*]] ; CHECK: [[TMP47]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP49:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP47]]) +; CHECK: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP47]]) ; entry: br label %.lr.ph @@ -70,7 +70,7 @@ ; CHECK: [[TMP45:%.*]] = mul <4 x i32> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %.lr.ph @@ -101,7 +101,7 @@ ; CHECK: [[TMP45:%.*]] = and <4 x i32> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %for.body @@ -131,7 +131,7 @@ ; CHECK: [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP46]] = or <4 x i32> [[VEC_PHI]], [[TMP45]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %for.body @@ -161,7 +161,7 @@ ; CHECK: [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP46]] = xor <4 x i32> [[VEC_PHI]], [[TMP45]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %for.body @@ -192,7 +192,7 @@ ; CHECK: [[TMP45:%.*]] = fadd fast <4 x float> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP46]]) ; entry: br label %for.body @@ -223,7 +223,7 @@ ; CHECK: [[TMP45:%.*]] = fmul fast <4 x float> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP46]]) ; entry: br label %for.body @@ -254,7 +254,7 @@ ; CHECK: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]] ; CHECK: [[TMP26]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP26]]) +; CHECK: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP26]]) ; entry: br label %for.body @@ -283,7 +283,7 @@ ; CHECK: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]] ; CHECK: [[TMP26]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP26]]) +; CHECK: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP26]]) ; entry: br label %for.body diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -8,7 +8,7 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @ext_ext_or_reduction_v4i32( ; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[Z]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[Z]]) ; CHECK-NEXT: ret i32 [[TMP1]] ; %z = and <4 x i32> %x, %y @@ -74,7 +74,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) ; CHECK-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP8]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5]] to i32 ; CHECK-NEXT: ret i32 [[COND6]] @@ -133,7 +133,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP3:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] @@ -181,7 +181,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = load 
<4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND5:%.*]] = zext i1 [[CMP4]] to i32 ; CHECK-NEXT: ret i32 [[COND5]] @@ -240,7 +240,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -23,7 +23,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: ret i32 [[TMP10]] ; %tmp00 = lshr i32 %a, 15 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -17,7 +17,7 @@ ; DEFAULT: for.body: ; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) ; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] ; DEFAULT-NEXT: br label [[FOR_BODY]] ; @@ -61,7 +61,7 @@ ; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 ; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 ; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) ; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]] ; GATHER-NEXT: br label [[FOR_BODY]] ; @@ -153,7 +153,7 @@ ; DEFAULT: for.body: ; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; DEFAULT-NEXT: 
[[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) ; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 ; DEFAULT-NEXT: br label [[FOR_BODY]] ; @@ -197,7 +197,7 @@ ; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 ; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 ; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) ; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5 ; GATHER-NEXT: br label [[FOR_BODY]] ; @@ -229,7 +229,7 @@ ; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] ; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] ; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -46,7 +46,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] @@ -169,7 +169,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP5]], [[S_020]] ; CHECK-NEXT: [[CMP14:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]] ; CHECK-NEXT: br i1 [[CMP14]], label [[IF_END]], label [[FOR_END_LOOPEXIT:%.*]] @@ -285,7 +285,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP10]], [[S_047]] ; CHECK-NEXT: [[CMP83:%.*]] = 
icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]] ; CHECK-NEXT: br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -244,7 +244,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) ; CHECK-NEXT: ret i32 [[TMP15]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP10]], 1 ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]] ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> , [[TMP5]] ; CHECK-NEXT: [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP9]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP6]] ; CHECK-NEXT: br label [[LOOP]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] @@ -62,7 +62,7 @@ ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 ; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 
@llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) ; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] ; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] ; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -84,7 +84,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] @@ -111,7 +111,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] @@ -138,7 +138,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] @@ -169,7 +169,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> 
[[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP5]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -102,7 +102,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 @@ -118,7 +118,7 @@ ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 @@ -175,7 +175,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: store float [[TMP5]], float* @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] @@ -187,7 +187,7 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: store float [[TMP5]], float* @res, align 4 ; THRESHOLD-NEXT: ret float [[TMP5]] @@ -223,7 +223,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = 
fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4 @@ -236,7 +236,7 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], i32* @n, align 4 @@ -390,8 +390,8 @@ ; CHECK-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[OP_RDX]] ; @@ -448,8 +448,8 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 ; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; @@ -637,7 +637,7 @@ ; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float 
@llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA]] ; @@ -678,7 +678,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA]] ; @@ -824,10 +824,10 @@ ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] @@ -873,10 +873,10 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) -; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) +; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) +; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] @@ -990,7 +990,7 @@ ; CHECK-NEXT: 
[[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA1]] @@ -1009,7 +1009,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] @@ -1060,7 +1060,7 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00 @@ -1081,7 +1081,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 ; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00 @@ -1138,7 +1138,7 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: ret float 
[[OP_EXTRA1]] @@ -1159,7 +1159,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] @@ -1212,7 +1212,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; CHECK-NEXT: ret i32 [[OP_EXTRA1]] @@ -1231,7 +1231,7 @@ ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 ; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer ; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> -; THRESHOLD-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; THRESHOLD-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; THRESHOLD-NEXT: ret i32 [[OP_EXTRA1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -12,7 +12,7 @@ define i32 @maxi8(i32) { ; CHECK-LABEL: @maxi8( ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -43,7 +43,7 @@ define i32 @maxi16(i32) { ; CHECK-LABEL: @maxi16( ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -98,7 +98,7 @@ define i32 @maxi32(i32) { ; CHECK-LABEL: @maxi32( ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 
@llvm.vector.reduce.smax.v32i32(<32 x i32> [[TMP2]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -758,7 +758,7 @@ ; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) ; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]] ; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] ; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]] @@ -776,7 +776,7 @@ ; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; THRESH-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0 ; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1 ; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0 @@ -860,7 +860,7 @@ ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) ; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] ; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] ; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] @@ -879,7 +879,7 @@ ; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; THRESH-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) ; THRESH-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] ; THRESH-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] ; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -37,7 +37,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_033]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]] @@ -70,7 +70,7 @@ ; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* ; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; STORE-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], -; STORE-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; STORE-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; STORE-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_033]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]] @@ -164,7 +164,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_040]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -202,7 +202,7 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; STORE-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_040]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -326,7 +326,7 @@ ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]] ; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4 ; CHECK-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]] ; CHECK-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_083]], 1 @@ -383,7 +383,7 @@ ; STORE-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 
[[ADD47]] ; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4 ; STORE-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]] -; STORE-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) +; STORE-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) ; STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]] ; STORE-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_083]], 1 @@ -520,7 +520,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_043]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -558,7 +558,7 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; STORE-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_043]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -1015,7 +1015,7 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; STORE-NEXT: store float [[TMP6]], float* [[C_ADDR_038]], align 4 ; STORE-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1 ; STORE-NEXT: [[INC]] = add nsw i64 [[I_039]], 1 @@ -1090,7 +1090,7 @@ ; STORE-LABEL: @float_red_example4( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1130,7 +1130,7 @@ ; STORE-LABEL: @float_red_example8( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; STORE-NEXT: 
[[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1194,7 +1194,7 @@ ; STORE-LABEL: @float_red_example16( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1250,7 +1250,7 @@ ; STORE-LABEL: @i32_red_example4( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1290,7 +1290,7 @@ ; STORE-LABEL: @i32_red_example8( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1354,7 +1354,7 @@ ; STORE-LABEL: @i32_red_example16( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1466,7 +1466,7 @@ ; STORE-LABEL: @i32_red_example32( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1544,14 +1544,14 @@ ; CHECK-LABEL: @i32_red_call( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: ret void ; ; STORE-LABEL: @i32_red_call( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; STORE-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; STORE-NEXT: ret void ; @@ -1579,7 +1579,7 @@ ; 
CHECK-LABEL: @i32_red_invoke( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]] ; CHECK: exception: @@ -1592,7 +1592,7 @@ ; STORE-LABEL: @i32_red_invoke( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; STORE-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]]) ; STORE-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]] ; STORE: exception: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @Foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> [[TMP0]]) ; CHECK-NEXT: ret i8 [[TMP1]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll @@ -80,7 +80,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[T4:%.*]] = load i32, i32* [[X4]], align 4 ; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[X5]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[T4]] ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[T4]] ; CHECK-NEXT: [[C012345:%.*]] = icmp sgt i32 [[TMP5]], [[T5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -35,7 +35,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) ; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: @@ -124,7 +124,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 
@llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: @@ -230,7 +230,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[REORDER_SHUFFLE]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -26,7 +26,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -74,7 +74,7 @@ ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) +; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) ; AVX-NEXT: ret i32 [[TMP2]] ; ; SSE-LABEL: @test_mul( @@ -148,7 +148,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -196,7 +196,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -244,7 +244,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -284,7 +284,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], ; CHECK-NEXT: 
[[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -18,14 +18,14 @@ ; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef ; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 ; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef ; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef ; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @dotf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[TMP1]] ; entry: @@ -33,7 +33,7 @@ ; CHECK-NEXT: 
[[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32 ; CHECK-NEXT: [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) ; CHECK-NEXT: ret double [[TMP3]] ; entry: @@ -63,7 +63,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: ret float [[TMP3]] ; entry: @@ -93,7 +93,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) ; CHECK-NEXT: ret double [[TMP3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -37,7 +37,7 @@ ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[DOTSROA_RAW_IDX_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], undef ; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 undef ; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[OP_EXTRA]], undef diff 
--git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
@@ -55,7 +55,7 @@
 ; CHECK-NEXT: [[TMP38:%.*]] = icmp slt <4 x i32> [[TMP37]], zeroinitializer
 ; CHECK-NEXT: [[TMP39:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP37]]
 ; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP38]], <4 x i32> [[TMP39]], <4 x i32> [[TMP37]]
-; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]])
+; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]])
 ; CHECK-NEXT: [[TMP42:%.*]] = icmp slt i32 [[TMP41]], [[TMP32]]
 ; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP41]], i32 [[TMP32]]
 ; CHECK-NEXT: [[TMP44:%.*]] = icmp slt i32 [[TMP43]], [[B_0]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6
 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7
 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT: ret i32 [[TMP11]]
 ;
 entry:
@@ -67,7 +67,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT: ret i32 [[TMP11]]
 ;
 entry:
@@ -121,7 +121,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6
 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT: ret i32 [[TMP11]]
 ;
 entry:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -291,14 +291,14 @@
 // LLVM vector reduction over a single vector.
 class LLVM_VectorReduction
-    : LLVM_OneResultIntrOp<"experimental.vector.reduce." # mnem,
+    : LLVM_OneResultIntrOp<"vector.reduce." # mnem,
                            [], [0], [NoSideEffect]>,
       Arguments<(ins LLVM_Type)>;

 // LLVM vector reduction over a single vector, with an initial value,
 // and with permission to reassociate the reduction operations.
-class LLVM_VectorReductionV2<string mnem>
-    : LLVM_OpBase<LLVM_Dialect, "intr.experimental.vector.reduce.v2." # mnem,
+class LLVM_VectorReductionAcc<string mnem>
+    : LLVM_OpBase<LLVM_Dialect, "intr.vector.reduce." # mnem,
                   [NoSideEffect]>,
       Results<(outs LLVM_Type:$res)>,
       Arguments<(ins LLVM_Type, LLVM_Type,
@@ -307,7 +307,7 @@
     llvm::Module *module = builder.GetInsertBlock()->getModule();
     llvm::Function *fn = llvm::Intrinsic::getDeclaration(
         module,
-        llvm::Intrinsic::experimental_vector_reduce_v2_}] # mnem # [{,
+        llvm::Intrinsic::vector_reduce_}] # mnem # [{,
         { }] # StrJoin.lst, ListIntSubst.lst)>.result # [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -885,20 +885,20 @@
 // Vector Reductions.
 //
-def LLVM_experimental_vector_reduce_add : LLVM_VectorReduction<"add">;
-def LLVM_experimental_vector_reduce_and : LLVM_VectorReduction<"and">;
-def LLVM_experimental_vector_reduce_mul : LLVM_VectorReduction<"mul">;
-def LLVM_experimental_vector_reduce_fmax : LLVM_VectorReduction<"fmax">;
-def LLVM_experimental_vector_reduce_fmin : LLVM_VectorReduction<"fmin">;
-def LLVM_experimental_vector_reduce_or : LLVM_VectorReduction<"or">;
-def LLVM_experimental_vector_reduce_smax : LLVM_VectorReduction<"smax">;
-def LLVM_experimental_vector_reduce_smin : LLVM_VectorReduction<"smin">;
-def LLVM_experimental_vector_reduce_umax : LLVM_VectorReduction<"umax">;
-def LLVM_experimental_vector_reduce_umin : LLVM_VectorReduction<"umin">;
-def LLVM_experimental_vector_reduce_xor : LLVM_VectorReduction<"xor">;
-
-def LLVM_experimental_vector_reduce_v2_fadd : LLVM_VectorReductionV2<"fadd">;
-def LLVM_experimental_vector_reduce_v2_fmul : LLVM_VectorReductionV2<"fmul">;
+def LLVM_vector_reduce_add : LLVM_VectorReduction<"add">;
+def LLVM_vector_reduce_and : LLVM_VectorReduction<"and">;
+def LLVM_vector_reduce_mul : LLVM_VectorReduction<"mul">;
+def LLVM_vector_reduce_fmax : LLVM_VectorReduction<"fmax">;
+def LLVM_vector_reduce_fmin : LLVM_VectorReduction<"fmin">;
+def LLVM_vector_reduce_or : LLVM_VectorReduction<"or">;
+def LLVM_vector_reduce_smax : LLVM_VectorReduction<"smax">;
+def LLVM_vector_reduce_smin : LLVM_VectorReduction<"smin">;
+def LLVM_vector_reduce_umax : LLVM_VectorReduction<"umax">;
+def LLVM_vector_reduce_umin : LLVM_VectorReduction<"umin">;
+def LLVM_vector_reduce_xor : LLVM_VectorReduction<"xor">;
+
+def LLVM_vector_reduce_fadd : LLVM_VectorReductionAcc<"fadd">;
+def LLVM_vector_reduce_fmul : LLVM_VectorReductionAcc<"fmul">;
 //
 // LLVM Matrix operations.
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -207,7 +207,7 @@
     Note that these operations are restricted to 1-D vectors to remain
     close to the corresponding LLVM intrinsics:
-    http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics
+    http://llvm.org/docs/LangRef.html#vector-reduction-intrinsics
     Example:
diff --git a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
--- a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
+++ b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
@@ -24,61 +24,61 @@
     %12 = llvm.mlir.constant(3 : i64) : !llvm.i64
     %v = llvm.insertelement %3, %11[%12 : !llvm.i64] : !llvm.vec<4 x float>
-    %max = "llvm.intr.experimental.vector.reduce.fmax"(%v)
+    %max = "llvm.intr.vector.reduce.fmax"(%v)
         : (!llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%max) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
-    %min = "llvm.intr.experimental.vector.reduce.fmin"(%v)
+    %min = "llvm.intr.vector.reduce.fmin"(%v)
        : (!llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%min) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
-    %add1 = "llvm.intr.experimental.vector.reduce.v2.fadd"(%0, %v)
+    %add1 = "llvm.intr.vector.reduce.fadd"(%0, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add1) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 11
-    %add1r = "llvm.intr.experimental.vector.reduce.v2.fadd"(%0, %v)
+    %add1r = "llvm.intr.vector.reduce.fadd"(%0, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add1r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 11
-    %add2 = "llvm.intr.experimental.vector.reduce.v2.fadd"(%1, %v)
+    %add2 = "llvm.intr.vector.reduce.fadd"(%1, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add2) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 12
-    %add2r = "llvm.intr.experimental.vector.reduce.v2.fadd"(%1, %v)
+    %add2r = "llvm.intr.vector.reduce.fadd"(%1, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add2r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 12
-    %mul1 = "llvm.intr.experimental.vector.reduce.v2.fmul"(%0, %v)
+    %mul1 = "llvm.intr.vector.reduce.fmul"(%0, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul1) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
-    %mul1r = "llvm.intr.experimental.vector.reduce.v2.fmul"(%0, %v)
+    %mul1r = "llvm.intr.vector.reduce.fmul"(%0, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul1r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
-    %mul2 = "llvm.intr.experimental.vector.reduce.v2.fmul"(%1, %v)
+    %mul2 = "llvm.intr.vector.reduce.fmul"(%1, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul2) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 48
-    %mul2r = "llvm.intr.experimental.vector.reduce.v2.fmul"(%1, %v)
+    %mul2r = "llvm.intr.vector.reduce.fmul"(%1, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul2r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
diff --git a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
--- a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
+++ b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
@@ -24,55 +24,55 @@
     %12 = llvm.mlir.constant(3 : i64) : !llvm.i64
     %v = llvm.insertelement %3, %11[%12 : !llvm.i64] : !llvm.vec<4 x i64>
-    %add = "llvm.intr.experimental.vector.reduce.add"(%v)
+    %add = "llvm.intr.vector.reduce.add"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%add) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 10
-    %and = "llvm.intr.experimental.vector.reduce.and"(%v)
+    %and = "llvm.intr.vector.reduce.and"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%and) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 0
-    %mul = "llvm.intr.experimental.vector.reduce.mul"(%v)
+    %mul = "llvm.intr.vector.reduce.mul"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%mul) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
-    %or = "llvm.intr.experimental.vector.reduce.or"(%v)
+    %or = "llvm.intr.vector.reduce.or"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%or) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 7
-    %smax = "llvm.intr.experimental.vector.reduce.smax"(%v)
+    %smax = "llvm.intr.vector.reduce.smax"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%smax) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
-    %smin = "llvm.intr.experimental.vector.reduce.smin"(%v)
+    %smin = "llvm.intr.vector.reduce.smin"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%smin) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
-    %umax = "llvm.intr.experimental.vector.reduce.umax"(%v)
+    %umax = "llvm.intr.vector.reduce.umax"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%umax) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
-    %umin = "llvm.intr.experimental.vector.reduce.umin"(%v)
+    %umin = "llvm.intr.vector.reduce.umin"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%umin) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
-    %xor = "llvm.intr.experimental.vector.reduce.xor"(%v)
+    %xor = "llvm.intr.vector.reduce.xor"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%xor) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -564,33 +564,33 @@
     if (eltType.isIntOrIndex()) {
       // Integer reductions: add/mul/min/max/and/or/xor.
if (kind == "add") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "mul") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "min" && (eltType.isIndex() || eltType.isUnsignedInteger())) - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "min") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "max" && (eltType.isIndex() || eltType.isUnsignedInteger())) - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "max") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "and") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "or") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "xor") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else return failure(); @@ -604,7 +604,7 @@ : rewriter.create( op->getLoc(), llvmType, rewriter.getZeroAttr(eltType)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, acc, operands[0], rewriter.getBoolAttr(reassociateFPReductions)); } else if (kind == "mul") { @@ -614,14 +614,14 @@ : rewriter.create( op->getLoc(), llvmType, rewriter.getFloatAttr(eltType, 1.0)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, acc, operands[0], rewriter.getBoolAttr(reassociateFPReductions)); } else if (kind == "min") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else if (kind == "max") - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, llvmType, operands[0]); else return failure(); diff --git a/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir --- a/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir @@ -5,14 +5,14 @@ // CHECK-LABEL: llvm.func @reduce_add_f32( // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>) // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float -// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]]) +// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]]) // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float // CHECK: llvm.return %[[V]] : !llvm.float // // REASSOC-LABEL: llvm.func @reduce_add_f32( // REASSOC-SAME: %[[A:.*]]: !llvm.vec<16 x float>) // REASSOC: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float -// REASSOC: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]]) +// REASSOC: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]]) // REASSOC-SAME: {reassoc = true} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float // REASSOC: llvm.return %[[V]] : !llvm.float // @@ -25,14 +25,14 @@ // CHECK-LABEL: llvm.func @reduce_mul_f32( // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>) // CHECK: %[[C:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float -// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fmul"(%[[C]], %[[A]]) +// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fmul"(%[[C]], %[[A]]) // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) 
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // CHECK: llvm.return %[[V]] : !llvm.float
 //
 // REASSOC-LABEL: llvm.func @reduce_mul_f32(
 // REASSOC-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // REASSOC: %[[C:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float
-// REASSOC: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fmul"(%[[C]], %[[A]])
+// REASSOC: %[[V:.*]] = "llvm.intr.vector.reduce.fmul"(%[[C]], %[[A]])
 // REASSOC-SAME: {reassoc = true} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // REASSOC: llvm.return %[[V]] : !llvm.float
 //
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -774,7 +774,7 @@
 // CHECK-LABEL: llvm.func @reduce_f16(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x half>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f16) : !llvm.half
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.half, !llvm.vec<16 x half>) -> !llvm.half
 // CHECK: llvm.return %[[V]] : !llvm.half
@@ -785,7 +785,7 @@
 // CHECK-LABEL: llvm.func @reduce_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // CHECK: llvm.return %[[V]] : !llvm.float
@@ -796,7 +796,7 @@
 // CHECK-LABEL: llvm.func @reduce_f64(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x double>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f64) : !llvm.double
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.double, !llvm.vec<16 x double>) -> !llvm.double
 // CHECK: llvm.return %[[V]] : !llvm.double
@@ -806,7 +806,7 @@
 }
 // CHECK-LABEL: llvm.func @reduce_i8(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i8>)
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 // CHECK: llvm.return %[[V]] : !llvm.i8
 
 func @reduce_i32(%arg0: vector<16xi32>) -> i32 {
@@ -815,7 +815,7 @@
 }
 // CHECK-LABEL: llvm.func @reduce_i32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i32>)
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 // CHECK: llvm.return %[[V]] : !llvm.i32
 
 func @reduce_i64(%arg0: vector<16xi64>) -> i64 {
@@ -824,7 +824,7 @@
 }
 // CHECK-LABEL: llvm.func @reduce_i64(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i64>)
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 // CHECK: llvm.return %[[V]] : !llvm.i64
diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir
--- a/mlir/test/Target/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/llvmir-intrinsics.mlir
@@ -182,36 +182,36 @@
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: !llvm.float, %arg1: !llvm.vec<8 x float>, %arg2: !llvm.vec<8 x i32>) {
-  // CHECK: call i32 @llvm.experimental.vector.reduce.add.v8i32
"llvm.intr.experimental.vector.reduce.add"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call i32 @llvm.experimental.vector.reduce.and.v8i32 - "llvm.intr.experimental.vector.reduce.and"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call float @llvm.experimental.vector.reduce.fmax.v8f32 - "llvm.intr.experimental.vector.reduce.fmax"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float - // CHECK: call float @llvm.experimental.vector.reduce.fmin.v8f32 - "llvm.intr.experimental.vector.reduce.fmin"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float - // CHECK: call i32 @llvm.experimental.vector.reduce.mul.v8i32 - "llvm.intr.experimental.vector.reduce.mul"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call i32 @llvm.experimental.vector.reduce.or.v8i32 - "llvm.intr.experimental.vector.reduce.or"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call i32 @llvm.experimental.vector.reduce.smax.v8i32 - "llvm.intr.experimental.vector.reduce.smax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call i32 @llvm.experimental.vector.reduce.smin.v8i32 - "llvm.intr.experimental.vector.reduce.smin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call i32 @llvm.experimental.vector.reduce.umax.v8i32 - "llvm.intr.experimental.vector.reduce.umax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call i32 @llvm.experimental.vector.reduce.umin.v8i32 - "llvm.intr.experimental.vector.reduce.umin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 - // CHECK: call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32 - "llvm.intr.experimental.vector.reduce.v2.fadd"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float - // CHECK: call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32 - "llvm.intr.experimental.vector.reduce.v2.fmul"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float - // CHECK: call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32 - "llvm.intr.experimental.vector.reduce.v2.fadd"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float - // CHECK: call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32 - "llvm.intr.experimental.vector.reduce.v2.fmul"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float - // CHECK: call i32 @llvm.experimental.vector.reduce.xor.v8i32 - "llvm.intr.experimental.vector.reduce.xor"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.add.v8i32 + "llvm.intr.vector.reduce.add"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.and.v8i32 + "llvm.intr.vector.reduce.and"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call float @llvm.vector.reduce.fmax.v8f32 + "llvm.intr.vector.reduce.fmax"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float + // CHECK: call float @llvm.vector.reduce.fmin.v8f32 + "llvm.intr.vector.reduce.fmin"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float + // CHECK: call i32 @llvm.vector.reduce.mul.v8i32 + "llvm.intr.vector.reduce.mul"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.or.v8i32 + "llvm.intr.vector.reduce.or"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.smax.v8i32 + "llvm.intr.vector.reduce.smax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.smin.v8i32 + "llvm.intr.vector.reduce.smin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.umax.v8i32 + 
"llvm.intr.vector.reduce.umax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call i32 @llvm.vector.reduce.umin.v8i32 + "llvm.intr.vector.reduce.umin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 + // CHECK: call float @llvm.vector.reduce.fadd.f32.v8f32 + "llvm.intr.vector.reduce.fadd"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float + // CHECK: call float @llvm.vector.reduce.fmul.f32.v8f32 + "llvm.intr.vector.reduce.fmul"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float + // CHECK: call reassoc float @llvm.vector.reduce.fadd.f32.v8f32 + "llvm.intr.vector.reduce.fadd"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float + // CHECK: call reassoc float @llvm.vector.reduce.fmul.f32.v8f32 + "llvm.intr.vector.reduce.fmul"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float + // CHECK: call i32 @llvm.vector.reduce.xor.v8i32 + "llvm.intr.vector.reduce.xor"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32 llvm.return }