diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16536,6 +16536,7 @@ respective operation across all elements of the vector, returning a single scalar result of the same element type. +.. _int_vector_reduce_add: '``llvm.vector.reduce.add.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16559,6 +16560,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fadd: + '``llvm.vector.reduce.fadd.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16611,6 +16614,8 @@ %ord = call float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %input) ; sequential reduction +.. _int_vector_reduce_mul: + '``llvm.vector.reduce.mul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16633,6 +16638,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fmul: + '``llvm.vector.reduce.fmul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16684,6 +16691,8 @@ %unord = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.0, <4 x float> %input) ; relaxed reduction %ord = call float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %input) ; sequential reduction +.. _int_vector_reduce_and: + '``llvm.vector.reduce.and.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16705,6 +16714,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_or: + '``llvm.vector.reduce.or.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16726,6 +16737,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_xor: + '``llvm.vector.reduce.xor.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16747,6 +16760,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_smax: + '``llvm.vector.reduce.smax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16768,6 +16783,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_smin: + '``llvm.vector.reduce.smin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16789,6 +16806,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_umax: + '``llvm.vector.reduce.umax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16810,6 +16829,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_umin: + '``llvm.vector.reduce.umin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16831,6 +16852,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fmax: + '``llvm.vector.reduce.fmax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16861,6 +16884,8 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. +.. _int_vector_reduce_fmin: + '``llvm.vector.reduce.fmin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -18473,6 +18498,697 @@ +.. _int_vp_reduce_add: + +'``llvm.vp.reduce.add.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+
+::
+
+      declare i32 @llvm.vp.reduce.add.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.add.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``ADD`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand. The third operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.add``' intrinsic performs the integer ``ADD`` reduction
+(:ref:`llvm.vector.reduce.add <int_vector_reduce_add>`) of the vector operand
+on each enabled lane. Disabled lanes are treated as containing the neutral
+value (``0``) (i.e. having no effect on the reduction operation).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> zeroinitializer
+      %also.r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked.a)
+
+
+.. _int_vp_reduce_fadd:
+
+'``llvm.vp.reduce.fadd.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fadd.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare double @llvm.vp.reduce.fadd.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``ADD`` reduction of a vector, returning the result
+as a scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand is the start value of the reduction, which must be a scalar
+floating-point value. The second operand must be a vector of floating-point
+values with the same element type as the start value. The third operand is the
+vector mask and has the same number of elements as the vector operand. The
+fourth operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fadd``' intrinsic performs the floating-point ``ADD``
+reduction (:ref:`llvm.vector.reduce.fadd <int_vector_reduce_fadd>`) of the
+vector operand on each enabled lane. Disabled lanes are treated as containing
+the neutral value (``-0.0``) (i.e. having no effect on the reduction
+operation). If no lanes are enabled, the resulting value will be equal to the
+starting value.
+
+If the call has the ``reassoc`` flag set, then the reduction will not preserve
+the associativity of the equivalent scalarized counterpart. Otherwise the
+reduction will be *sequential*. See the unpredicated version
+(:ref:`llvm.vector.reduce.fadd <int_vector_reduce_fadd>`) for more detail.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>
+      %also.r = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %masked.a)
+
+
+.. _int_vp_reduce_mul:
+
+'``llvm.vp.reduce.mul.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+ +:: + + declare i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.mul.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated integer ``MUL`` reduction of a vector, returning the result as a +scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.mul``' intrinsic performs the integer ``MUL`` reduction +(:ref:`llvm.vector.reduce.mul `) of the vector operand +on each enabled lane. Disabled lanes are treated as containing the neutral +value (``1``) (i.e. having no effect on the reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %masked.a) + +.. _int_vp_reduce_fmul: + +'``llvm.vp.reduce.fmul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare float @llvm.vp.reduce.fmul.v4f32(float , <4 x float> , <4 x i1> , i32 ) + declare double @llvm.vp.reduce.fmul.nxv8f64(double , , , i32 ) + +Overview: +""""""""" + +Predicated floating-point ``MUL`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of floating-point values. The result type +must be the element type of the first operand. The second operand is the vector +mask and has the same number of elements as the first operand. The third +operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.fmul``' intrinsic performs the floating-point ``MUL`` +reduction (:ref:`llvm.vector.reduce.fmul `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``1.0``) (i.e. having no effect on the reduction operation). +If no lanes are enabled, the resulting value will be equal to the starting +value. + +If the call has the ``reassoc`` flag set, then the reduction will not preserve +the associativity of the equivalent scalarized counterpart. Otherwise the +reduction will be *sequential*. See the unpredicated version +(:ref:`llvm.vector.reduce.fmul `) for more detail. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> + %also.r = call float @llvm.vector.reduce.fmul.v4f32(float %start, <4 x float> %masked.a) + + +.. _int_vp_reduce_and: + +'``llvm.vp.reduce.and.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+
+::
+
+      declare i32 @llvm.vp.reduce.and.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.and.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``AND`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand. The third operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.and``' intrinsic performs the integer ``AND`` reduction
+(:ref:`llvm.vector.reduce.and <int_vector_reduce_and>`) of the vector operand
+on each enabled lane. Disabled lanes are treated as containing the neutral
+value (``UINT_MAX``, or ``-1``, all bits set) (i.e. having no effect on the
+reduction operation).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.vp.reduce.and.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+      %also.r = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %masked.a)
+
+
+.. _int_vp_reduce_or:
+
+'``llvm.vp.reduce.or.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.vp.reduce.or.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.or.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``OR`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand. The third operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.or``' intrinsic performs the integer ``OR`` reduction
+(:ref:`llvm.vector.reduce.or <int_vector_reduce_or>`) of the vector operand on
+each enabled lane. Disabled lanes are treated as containing the neutral value
+(``0``) (i.e. having no effect on the reduction operation).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.vp.reduce.or.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> zeroinitializer
+      %also.r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %masked.a)
+
+.. _int_vp_reduce_xor:
+
+'``llvm.vp.reduce.xor.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.vp.reduce.xor.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.xor.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``XOR`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand.
The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.xor``' intrinsic performs the integer ``XOR`` reduction +(:ref:`llvm.vector.reduce.xor `) of the vector operand +on each enabled lane. Disabled lanes are treated as containing the neutral +value (``0``) (i.e. having no effect on the reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.xor.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %masked.a) + + +.. _int_vp_reduce_smax: + +'``llvm.vp.reduce.smax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.smax.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.smax.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated signed-integer ``MAX`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.smax``' intrinsic performs the signed-integer ``MAX`` +reduction (:ref:`llvm.vector.reduce.smax `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``INT_MIN``) (i.e. having no effect on the reduction +operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i8 @llvm.vp.reduce.smax.v4i8(<4 x i8> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i8> %a, <4 x i8> + %also.r = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %masked.a) + + +.. _int_vp_reduce_smin: + +'``llvm.vp.reduce.smin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.smin.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.smin.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated signed-integer ``MIN`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.smin``' intrinsic performs the signed-integer ``MIN`` +reduction (:ref:`llvm.vector.reduce.smin `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``INT_MAX``) (i.e. having no effect on the reduction +operation). + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call i8 @llvm.vp.reduce.smin.v4i8(<4 x i8> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i8> %a, <4 x i8> + %also.r = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %masked.a) + + +.. _int_vp_reduce_umax: + +'``llvm.vp.reduce.umax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.umax.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.umax.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated unsigned-integer ``MAX`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.umax``' intrinsic performs the unsigned-integer ``MAX`` +reduction (:ref:`llvm.vector.reduce.umax `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``0``) (i.e. having no effect on the reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.umax.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %masked.a) + + +.. _int_vp_reduce_umin: + +'``llvm.vp.reduce.umin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.umin.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.umin.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated unsigned-integer ``MIN`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.umin``' intrinsic performs the unsigned-integer ``MIN`` +reduction (:ref:`llvm.vector.reduce.umin `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``UINT_MAX``, or ``-1``) (i.e. having no effect on the +reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.umin.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %masked.a) + + +.. 
_int_vp_reduce_fmax:
+
+'``llvm.vp.reduce.fmax.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fmax.v4f32(<4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare double @llvm.vp.reduce.fmax.nxv8f64(<vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``MAX`` reduction of a vector, returning the result
+as a scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of floating-point values. The result type
+must be the element type of the first operand. The second operand is the vector
+mask and has the same number of elements as the first operand. The third
+operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fmax``' intrinsic performs the floating-point ``MAX``
+reduction (:ref:`llvm.vector.reduce.fmax <int_vector_reduce_fmax>`) of the
+vector operand on each enabled lane. Disabled lanes are treated as containing
+the neutral value (i.e. having no effect on the reduction operation).
+
+The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no
+flags are set, the neutral value is ``-QNAN``. If ``nnan`` and ``ninf`` are
+both set, then the neutral value is the smallest floating-point value for the
+result type. If only ``nnan`` is set then the neutral value is ``-Infinity``.
+
+This instruction has the same comparison semantics as the
+:ref:`llvm.vector.reduce.fmax <int_vector_reduce_fmax>` intrinsic (and thus
+the '``llvm.maxnum.*``' intrinsic). That is, the result will always be a number
+unless all elements of the vector are ``NaN``. Note that this means if all
+lanes are disabled the result will *not* be a number. For a vector with maximum
+element magnitude ``0.0`` and containing both ``+0.0`` and ``-0.0`` elements,
+the sign of the result is unspecified.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fmax.v4f32(<4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000, float 0xFFF8000000000000, float 0xFFF8000000000000>
+      %also.r = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %masked.a)
+
+
+.. _int_vp_reduce_fmin:
+
+'``llvm.vp.reduce.fmin.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fmin.v4f32(<4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare double @llvm.vp.reduce.fmin.nxv8f64(<vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``MIN`` reduction of a vector, returning the result
+as a scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of floating-point values. The result type
+must be the element type of the first operand. The second operand is the vector
+mask and has the same number of elements as the first operand. The third
+operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fmin``' intrinsic performs the floating-point ``MIN``
+reduction (:ref:`llvm.vector.reduce.fmin <int_vector_reduce_fmin>`) of the
+vector operand on each enabled lane. Disabled lanes are treated as containing
+the neutral value (i.e. having no effect on the reduction operation).
+
+The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no
+flags are set, the neutral value is ``+QNAN``. If ``nnan`` and ``ninf`` are
+both set, then the neutral value is the largest floating-point value for the
+result type. If only ``nnan`` is set then the neutral value is ``+Infinity``.
+
+This instruction has the same comparison semantics as the
+:ref:`llvm.vector.reduce.fmin <int_vector_reduce_fmin>` intrinsic (and thus the
+'``llvm.minnum.*``' intrinsic). That is, the result will always be a number
+unless all elements of the vector are ``NaN``. Note that this means if all
+lanes are disabled the result will *not* be a number. For a vector with maximum
+element magnitude ``0.0`` and containing both ``+0.0`` and ``-0.0`` elements,
+the sign of the result is unspecified.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000>
+      %also.r = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %masked.a)
+
+
 .. _int_get_active_lane_mask:

 '``llvm.get.active.lane.mask.*``' Intrinsics
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -428,6 +428,29 @@
   // Equivalent non-predicated opcode
   static Optional<unsigned> getFunctionalOpcodeForVP(Intrinsic::ID ID);
+
+  static bool isVPReduction(Intrinsic::ID ID);
+};
+
+/// This represents vector predication reduction intrinsics.
+class VPReductionIntrinsic : public VPIntrinsic {
+public:
+
+  unsigned getVectorParamPos() const;
+  Optional<unsigned> getStartParamPos() const;
+
+  static Optional<unsigned> getVectorParamPos(Intrinsic::ID ID);
+  static Optional<unsigned> getStartParamPos(Intrinsic::ID ID);
+
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  /// @{
+  static bool classof(const IntrinsicInst *I) {
+    return VPIntrinsic::isVPReduction(I->getIntrinsicID());
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+  /// @}
 };

 /// This is the common base class for constrained floating point intrinsics.
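As a usage illustration (not part of the patch), a minimal sketch of how client
code might consume the new ``VPReductionIntrinsic`` accessors; the helper name
``splitVPReduction`` is a hypothetical example:

.. code-block:: c++

  // Sketch only: pulling apart a VP reduction call via the new accessors.
  #include "llvm/IR/IntrinsicInst.h"
  #include <utility>
  using namespace llvm;

  // Returns {reduced vector, start value or nullptr}.
  static std::pair<Value *, Value *>
  splitVPReduction(VPReductionIntrinsic &VPR) {
    // Vector operand being reduced: index 0 for the plain reductions, index 1
    // for fadd/fmul, which carry a scalar start value in front of it.
    Value *Vec = VPR.getOperand(VPR.getVectorParamPos());
    // Only llvm.vp.reduce.fadd/fmul have a start value; getStartParamPos()
    // returns None for the integer and fmin/fmax reductions.
    Value *Start = nullptr;
    if (Optional<unsigned> StartPos = VPR.getStartParamPos())
      Start = VPR.getOperand(*StartPos);
    return {Vec, Start};
  }

This mirrors how ``expandPredicationInReduction`` in
``ExpandVectorPredication.cpp`` below reads the operands before rebuilding the
unpredicated reduction.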
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1469,6 +1469,64 @@ llvm_i32_ty]>; } +// Reductions +let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_add : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_mul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_and : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_or : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_xor : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_smax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_smin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_umax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_umin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_fmax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_fmin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; +} + def int_get_active_lane_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyint_ty, LLVMMatchType<1>], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -100,6 +100,16 @@ #define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) #endif +// Map this VP intrinsic to its canonical functional intrinsic. +#ifndef HANDLE_VP_TO_INTRIN +#define HANDLE_VP_TO_INTRIN(VPID, IID) +#endif + +// Map this VP reduction intrinsic to its reduction operand positions. +#ifndef HANDLE_VP_REDUCTION +#define HANDLE_VP_REDUCTION(ID, STARTPOS, VECTORPOS) +#endif + /// } Property Macros ///// Integer Arithmetic { @@ -191,6 +201,92 @@ ///// } Floating-Point Arithmetic +///// Reduction { + +// Specialized helper macro for reductions (%x, %mask, %evl). +#ifdef HELPER_REGISTER_REDUCTION_VP +#error "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!" 
+#endif +#define HELPER_REGISTER_REDUCTION_VP(VPINTRIN, SDOPC, INTRIN) \ +BEGIN_REGISTER_VP(VPINTRIN, 1, 2, SDOPC, -1) \ +HANDLE_VP_TO_INTRIN(VPINTRIN, INTRIN) \ +HANDLE_VP_REDUCTION(VPINTRIN, None, 0) \ +END_REGISTER_VP(VPINTRIN, SDOPC) + +// llvm.vp.reduce.add(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD, + experimental_vector_reduce_add) + +// llvm.vp.reduce.mul(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_mul, VP_REDUCE_MUL, + experimental_vector_reduce_mul) + +// llvm.vp.reduce.and(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_and, VP_REDUCE_AND, + experimental_vector_reduce_and) + +// llvm.vp.reduce.or(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_or, VP_REDUCE_OR, + experimental_vector_reduce_or) + +// llvm.vp.reduce.xor(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_xor, VP_REDUCE_XOR, + experimental_vector_reduce_xor) + +// llvm.vp.reduce.smax(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_smax, VP_REDUCE_SMAX, + experimental_vector_reduce_smax) + +// llvm.vp.reduce.smin(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_smin, VP_REDUCE_SMIN, + experimental_vector_reduce_smin) + +// llvm.vp.reduce.umax(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_umax, VP_REDUCE_UMAX, + experimental_vector_reduce_umax) + +// llvm.vp.reduce.umin(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_umin, VP_REDUCE_UMIN, + experimental_vector_reduce_umin) + +// llvm.vp.reduce.fmax(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX, + experimental_vector_reduce_fmax) + +// llvm.vp.reduce.fmin(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, + experimental_vector_reduce_fmin) + +#undef HELPER_REGISTER_REDUCTION_VP + +// Specialized helper macro for reductions with a starting value (%acc, %x, %mask, %evl). +#ifdef HELPER_REGISTER_REDUCTION_START_VP +#error "The internal helper macro HELPER_REGISTER_REDUCTION_START_VP is already defined!" 
+#endif +#define HELPER_REGISTER_REDUCTION_START_VP(VPINTRIN, SDOPC, SEQ_SDOPC, INTRIN) \ +BEGIN_REGISTER_VP_INTRINSIC(VPINTRIN, 2, 3) \ +BEGIN_REGISTER_VP_SDNODE(SDOPC, -1, VPINTRIN, 1, 2) \ +END_REGISTER_VP_SDNODE(SDOPC) \ +BEGIN_REGISTER_VP_SDNODE(SEQ_SDOPC, -1, VPINTRIN, 2, 3) \ +END_REGISTER_VP_SDNODE(SEQ_SDOPC) \ +HANDLE_VP_TO_INTRIN(VPINTRIN, INTRIN) \ +HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ +END_REGISTER_VP_INTRINSIC(VPINTRIN) + +// llvm.vp.reduce.fadd(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_START_VP(vp_reduce_fadd, VP_REDUCE_FADD, + VP_REDUCE_SEQ_FADD, + experimental_vector_reduce_fadd) + +// llvm.vp.reduce.fmul(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_START_VP(vp_reduce_fmul, VP_REDUCE_FMUL, + VP_REDUCE_SEQ_FMUL, + experimental_vector_reduce_fmul) + +#undef HELPER_REGISTER_REDUCTION_START_VP + +///// } Reduction + #undef BEGIN_REGISTER_VP #undef BEGIN_REGISTER_VP_INTRINSIC #undef BEGIN_REGISTER_VP_SDNODE @@ -199,3 +295,5 @@ #undef END_REGISTER_VP_SDNODE #undef HANDLE_VP_TO_OPC #undef HANDLE_VP_TO_CONSTRAINEDFP +#undef HANDLE_VP_TO_INTRIN +#undef HANDLE_VP_REDUCTION diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -158,6 +159,11 @@ Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI); + /// \brief Lower this VP reduction to a call to an unpredicated reduction + /// intrinsic. + Value *expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &PI); + /// \brief Query TTI and expand the vector predication in \p P accordingly. Value *expandPredication(VPIntrinsic &PI); @@ -248,6 +254,118 @@ return NewBinOp; } +static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, + Type *EltTy) { + bool Negative = false; + unsigned EltBits = EltTy->getScalarSizeInBits(); + switch (VPI.getIntrinsicID()) { + default: + llvm_unreachable("Expecting a VP reduction intrinsic"); + case Intrinsic::vp_reduce_add: + case Intrinsic::vp_reduce_or: + case Intrinsic::vp_reduce_xor: + case Intrinsic::vp_reduce_umax: + return Constant::getNullValue(EltTy); + case Intrinsic::vp_reduce_mul: + return ConstantInt::get(EltTy, 1, /*IsSigned*/ false); + case Intrinsic::vp_reduce_and: + case Intrinsic::vp_reduce_umin: + return ConstantInt::getAllOnesValue(EltTy); + case Intrinsic::vp_reduce_smin: + return ConstantInt::get(EltTy->getContext(), + APInt::getSignedMaxValue(EltBits)); + case Intrinsic::vp_reduce_smax: + return ConstantInt::get(EltTy->getContext(), + APInt::getSignedMinValue(EltBits)); + case Intrinsic::vp_reduce_fmax: + Negative = true; + LLVM_FALLTHROUGH; + case Intrinsic::vp_reduce_fmin: { + FastMathFlags Flags = VPI.getFastMathFlags(); + const fltSemantics &Semantics = EltTy->getFltSemantics(); + return !Flags.noNaNs() ? ConstantFP::getQNaN(EltTy, Negative) + : !Flags.noInfs() + ? 
ConstantFP::getInfinity(EltTy, Negative) + : ConstantFP::get(EltTy, + APFloat::getLargest(Semantics, Negative)); + } + case Intrinsic::vp_reduce_fadd: + return ConstantFP::getNegativeZero(EltTy); + case Intrinsic::vp_reduce_fmul: + return ConstantFP::get(EltTy, 1.0); + } +} + +Value * +CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &VPI) { + assert((isSafeToSpeculativelyExecute(&VPI) || + VPI.canIgnoreVectorLengthParam()) && + "Implicitly dropping %evl in non-speculatable operator!"); + + Value *Mask = VPI.getMaskParam(); + Value *RedOp = VPI.getOperand(VPI.getVectorParamPos()); + + // Insert neutral element in masked-out positions + if (Mask && !isAllTrueMask(Mask)) { + auto *NeutralElt = getNeutralReductionElement(VPI, VPI.getType()); + auto *NeutralVector = Builder.CreateVectorSplat( + cast(RedOp->getType())->getElementCount(), NeutralElt); + RedOp = Builder.CreateSelect(Mask, RedOp, NeutralVector); + } + + Value *NewReduction; + + switch (VPI.getIntrinsicID()) { + default: + llvm_unreachable("Impossible reduction kind"); + case Intrinsic::vp_reduce_add: + NewReduction = Builder.CreateAddReduce(RedOp); + break; + case Intrinsic::vp_reduce_mul: + NewReduction = Builder.CreateMulReduce(RedOp); + break; + case Intrinsic::vp_reduce_and: + NewReduction = Builder.CreateAndReduce(RedOp); + break; + case Intrinsic::vp_reduce_or: + NewReduction = Builder.CreateOrReduce(RedOp); + break; + case Intrinsic::vp_reduce_xor: + NewReduction = Builder.CreateXorReduce(RedOp); + break; + case Intrinsic::vp_reduce_smax: + NewReduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true); + break; + case Intrinsic::vp_reduce_smin: + NewReduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true); + break; + case Intrinsic::vp_reduce_umax: + NewReduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false); + break; + case Intrinsic::vp_reduce_umin: + NewReduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false); + break; + case Intrinsic::vp_reduce_fmax: + NewReduction = Builder.CreateFPMaxReduce(RedOp); + break; + case Intrinsic::vp_reduce_fmin: + NewReduction = Builder.CreateFPMinReduce(RedOp); + break; + case Intrinsic::vp_reduce_fadd: + NewReduction = Builder.CreateFAddReduce( + VPI.getOperand(*VPI.getStartParamPos()), RedOp); + break; + case Intrinsic::vp_reduce_fmul: + NewReduction = Builder.CreateFMulReduce( + VPI.getOperand(*VPI.getStartParamPos()), RedOp); + break; + } + + replaceOperation(*NewReduction, VPI); + return NewReduction; +} + void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n"); @@ -321,6 +439,9 @@ if (OC && Instruction::isBinaryOp(*OC)) return expandPredicationInBinaryOperator(Builder, VPI); + if (auto *VPRI = dyn_cast(&VPI)) + return expandPredicationInReduction(Builder, *VPRI); + return &VPI; } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -410,11 +410,58 @@ // TODO: Extend this for other VP intrinsics as they are upstreamed. This // works for binary arithmetic VP intrinsics. 
- auto *VPFunc = Intrinsic::getDeclaration(M, VPID, Params[0]->getType()); + Type *OverloadTy = Params[0]->getType(); + if (VPIntrinsic::isVPReduction(VPID)) + OverloadTy = + Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType(); + + auto *VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); assert(VPFunc && "Could not declare VP intrinsic"); return VPFunc; } +bool VPIntrinsic::isVPReduction(Intrinsic::ID ID) { + switch (ID) { + default: + return false; +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + break; +#include "llvm/IR/VPIntrinsics.def" + } + return true; +} + +unsigned VPReductionIntrinsic::getVectorParamPos() const { + return *VPReductionIntrinsic::getVectorParamPos(getIntrinsicID()); +} + +Optional VPReductionIntrinsic::getStartParamPos() const { + return VPReductionIntrinsic::getStartParamPos(getIntrinsicID()); +} + +Optional VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { + switch (ID) { +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + return VECTORPOS; +#include "llvm/IR/VPIntrinsics.def" + default: + return None; + } +} + +Optional VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { + switch (ID) { +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + return STARTPOS; +#include "llvm/IR/VPIntrinsics.def" + default: + return None; + } +} + Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { switch (getIntrinsicID()) { case Intrinsic::uadd_with_overflow: diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll --- a/llvm/test/CodeGen/Generic/expand-vp.ll +++ b/llvm/test/CodeGen/Generic/expand-vp.ll @@ -25,6 +25,10 @@ declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; Reductions +declare i32 @llvm.vp.reduce.add.v4i32(<4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.mul.v4i32(<4 x i32>, <4 x i1>, i32) +declare float @llvm.vp.reduce.fmin.v4f32(<4 x float>, <4 x i1>, i32) ; Fixed vector test function. define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { @@ -78,6 +82,21 @@ %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ret void } + +; Fixed vector reduce test function. +define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { + %r0 = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) + %r1 = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) + ret void +} + +define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { + %r0 = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) + %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) + %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) + ret void +} + ; All VP intrinsics have to be lowered into non-VP ops ; Convert %evl into %mask for non-speculatable VP intrinsics and emit the ; instruction+select idiom with a non-VP SIMD instruction. 
@@ -121,7 +140,23 @@ ; ALL-CONVERT: ret void +; Check that reductions use the correct neutral element for masked-off elements +; ALL-CONVERT: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; ALL-CONVERT-NEXT: [[ADD:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ADD]]) +; ALL-CONVERT-NEXT: [[MUL:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[MUL]]) +; ALL-CONVERT-NEXT: ret void +; Check that reductions use the correct neutral element for masked-off elements +; ALL-CONVERT: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; ALL-CONVERT-NEXT: [[FMIN:%.+]] = select <4 x i1> %m, <4 x float> %vi, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN]]) +; ALL-CONVERT-NEXT: [[FMIN_NNAN:%.+]] = select <4 x i1> %m, <4 x float> %vi, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN]]) +; ALL-CONVERT-NEXT: [[FMIN_NNAN_NINF:%.+]] = select <4 x i1> %m, <4 x float> %vi, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN_NINF]]) +; ALL-CONVERT-NEXT: ret void ; All legal - don't transform anything. @@ -157,6 +192,16 @@ ; LEGAL_LEGAL-NEXT: %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ; LEGAL_LEGAL-NEXT: ret void +; LEGAL_LEGAL: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; LEGAL_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: ret void + +; LEGAL_LEGAL: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; LEGAL_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: ret void ; Drop %evl where possible else fold %evl into %mask (%evl Discard, %mask Legal) ; @@ -205,6 +250,16 @@ ; DISCARD_LEGAL-NOT: %{{.+}} = call @llvm.vp.{{.*}}, i32 %n) ; DISCARD_LEGAL: ret void +; DISCARD_LEGAL: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; DISCARD_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: ret void + +; DISCARD_LEGAL: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; DISCARD_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: ret void ; Convert %evl into %mask everywhere (%evl Convert, %mask Legal) ; @@ -243,3 +298,21 @@ ; CONVERT_LEGAL-NOT: %{{.*}} = call @llvm.vp.{{.*}}, i32 %n) ; CONVERT_LEGAL: ret void +; CONVERT_LEGAL: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; 
CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; CONVERT_LEGAL-NEXT: %{{.+}} = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> [[NEWM]], i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL: ret void + +; CONVERT_LEGAL: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; CONVERT_LEGAL-NEXT: %{{.+}} = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> [[NEWM]], i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL: ret void diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll --- a/llvm/test/Verifier/vp-intrinsics.ll +++ b/llvm/test/Verifier/vp-intrinsics.ll @@ -29,6 +29,24 @@ ; TODO: test_vp_constrained_fp + +define void @test_vp_reduction(<8 x i32> %vi, <8 x float> %vf, float %f, <8 x i1> %m, i32 %n) { + %r0 = call i32 @llvm.vp.reduce.add.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r1 = call i32 @llvm.vp.reduce.mul.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r2 = call i32 @llvm.vp.reduce.and.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r3 = call i32 @llvm.vp.reduce.or.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r4 = call i32 @llvm.vp.reduce.xor.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r5 = call i32 @llvm.vp.reduce.smax.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r6 = call i32 @llvm.vp.reduce.smin.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r7 = call i32 @llvm.vp.reduce.umax.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r8 = call i32 @llvm.vp.reduce.umin.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r9 = call float @llvm.vp.reduce.fmin.v8f32(<8 x float> %vf, <8 x i1> %m, i32 %n) + %rA = call float @llvm.vp.reduce.fmax.v8f32(<8 x float> %vf, <8 x i1> %m, i32 %n) + %rB = call float @llvm.vp.reduce.fadd.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n) + %rC = call float @llvm.vp.reduce.fmul.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n) + ret void +} + ; integer arith declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) @@ -50,3 +68,17 @@ declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +; reductions +declare i32 @llvm.vp.reduce.add.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.mul.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.and.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.or.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.xor.v8i32(<8 x i32>, <8 x i1>, i32) 
+declare i32 @llvm.vp.reduce.smax.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.smin.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.umax.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.umin.v8i32(<8 x i32>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fmin.v8f32(<8 x float>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fmax.v8f32(<8 x float>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fmul.v8f32(float, <8 x float>, <8 x i1>, i32) diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -46,6 +46,21 @@ Str << " declare <8 x float> @llvm.vp." << BinaryFPOpcode << ".v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) "; + const char *ReductionIntOpcodes[] = {"add", "mul", "and", "or", "xor", + "smin", "smax", "umin", "umax"}; + for (const char *ReductionOpcode : ReductionIntOpcodes) + Str << " declare i32 @llvm.vp.reduce." << ReductionOpcode + << ".v8i32(<8 x i32>, <8 x i1>, i32) "; + + std::pair ReductionFPOpcodes[] = { + {"fadd", true}, {"fmul", true}, {"fmin", false}, {"fmax", false}}; + for (const auto &ReductionPair : ReductionFPOpcodes) + Str << " declare float @llvm.vp.reduce." << ReductionPair.first + << ".v8f32(" << (ReductionPair.second ? "float, " : "") + << "<8 x float>, <8 x i1>, i32) "; + + dbgs() << Str.str() << "\n"; + return parseAssemblyString(Str.str(), Err, C); } };
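As a usage illustration (not part of the patch), a minimal sketch of emitting
one of the new intrinsics by hand; the helper name ``emitVPReduceAdd`` and the
surrounding values are assumptions for the example:

.. code-block:: c++

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // For a <4 x i32> input this builds, e.g.:
  //   %red = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vec, <4 x i1> %mask, i32 %evl)
  static Value *emitVPReduceAdd(Module &M, IRBuilder<> &Builder, Value *Vec,
                                Value *Mask, Value *EVL) {
    // The intrinsic is overloaded only on the vector operand; the scalar
    // result type is implied by its element type, matching the TableGen
    // definitions added above.
    Function *F = Intrinsic::getDeclaration(&M, Intrinsic::vp_reduce_add,
                                            {Vec->getType()});
    return Builder.CreateCall(F, {Vec, Mask, EVL}, "red");
  }

On targets that do not support an explicit vector length, the
ExpandVectorPredication pass folds such a call back into a ``select`` on the
mask followed by ``llvm.vector.reduce.add``, as exercised by the
``expand-vp.ll`` checks above.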