diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16536,6 +16536,7 @@ respective operation across all elements of the vector, returning a single scalar result of the same element type. +.. _int_vector_reduce_add: '``llvm.vector.reduce.add.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16559,6 +16560,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fadd: + '``llvm.vector.reduce.fadd.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16611,6 +16614,8 @@ %ord = call float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %input) ; sequential reduction +.. _int_vector_reduce_mul: + '``llvm.vector.reduce.mul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16633,6 +16638,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fmul: + '``llvm.vector.reduce.fmul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16684,6 +16691,8 @@ %unord = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.0, <4 x float> %input) ; relaxed reduction %ord = call float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %input) ; sequential reduction +.. _int_vector_reduce_and: + '``llvm.vector.reduce.and.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16705,6 +16714,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_or: + '``llvm.vector.reduce.or.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16726,6 +16737,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_xor: + '``llvm.vector.reduce.xor.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16747,6 +16760,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_smax: + '``llvm.vector.reduce.smax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16768,6 +16783,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_smin: + '``llvm.vector.reduce.smin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16789,6 +16806,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_umax: + '``llvm.vector.reduce.umax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16810,6 +16829,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_umin: + '``llvm.vector.reduce.umin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16831,6 +16852,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fmax: + '``llvm.vector.reduce.fmax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16861,6 +16884,8 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. +.. _int_vector_reduce_fmin: + '``llvm.vector.reduce.fmin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -18473,6 +18498,697 @@ +.. _int_vp_reduce_add: + +'``llvm.vp.reduce.add.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+
+::
+
+      declare i32 @llvm.vp.reduce.add.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.add.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``ADD`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand. The third operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.add``' intrinsic performs the integer ``ADD`` reduction
+(:ref:`llvm.vector.reduce.add <int_vector_reduce_add>`) of the vector operand
+on each enabled lane. Disabled lanes are treated as containing the neutral
+value (``0``) (i.e. having no effect on the reduction operation).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> zeroinitializer
+      %also.r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked.a)
+
+
+.. _int_vp_reduce_fadd:
+
+'``llvm.vp.reduce.fadd.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fadd.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare double @llvm.vp.reduce.fadd.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``ADD`` reduction of a vector, returning the result
+as a scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand is the start value of the reduction, which must be a scalar
+floating-point value. The second operand must be a vector of floating-point
+values with the same element type as the start value. The third operand is the
+vector mask and has the same number of elements as the vector operand. The
+fourth operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fadd``' intrinsic performs the floating-point ``ADD``
+reduction (:ref:`llvm.vector.reduce.fadd <int_vector_reduce_fadd>`) of the
+vector operand on each enabled lane. Disabled lanes are treated as containing
+the neutral value (``-0.0``) (i.e. having no effect on the reduction
+operation). If no lanes are enabled, the resulting value will be equal to the
+starting value.
+
+If the call has the ``reassoc`` flag set, then the reduction will not preserve
+the associativity of the equivalent scalarized counterpart. Otherwise the
+reduction will be *sequential*. See the unpredicated version
+(:ref:`llvm.vector.reduce.fadd <int_vector_reduce_fadd>`) for more detail.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>
+      %also.r = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %masked.a)
+
+
+.. _int_vp_reduce_mul:
+
+'``llvm.vp.reduce.mul.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+ +:: + + declare i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.mul.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated integer ``MUL`` reduction of a vector, returning the result as a +scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.mul``' intrinsic performs the integer ``MUL`` reduction +(:ref:`llvm.vector.reduce.mul `) of the vector operand +on each enabled lane. Disabled lanes are treated as containing the neutral +value (``1``) (i.e. having no effect on the reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %masked.a) + +.. _int_vp_reduce_fmul: + +'``llvm.vp.reduce.fmul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare float @llvm.vp.reduce.fmul.v4f32(float , <4 x float> , <4 x i1> , i32 ) + declare double @llvm.vp.reduce.fmul.nxv8f64(double , , , i32 ) + +Overview: +""""""""" + +Predicated floating-point ``MUL`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of floating-point values. The result type +must be the element type of the first operand. The second operand is the vector +mask and has the same number of elements as the first operand. The third +operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.fmul``' intrinsic performs the floating-point ``MUL`` +reduction (:ref:`llvm.vector.reduce.fmul `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``1.0``) (i.e. having no effect on the reduction operation). +If no lanes are enabled, the resulting value will be equal to the starting +value. + +If the call has the ``reassoc`` flag set, then the reduction will not preserve +the associativity of the equivalent scalarized counterpart. Otherwise the +reduction will be *sequential*. See the unpredicated version +(:ref:`llvm.vector.reduce.fmul `) for more detail. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> + %also.r = call float @llvm.vector.reduce.fmul.v4f32(float %start, <4 x float> %masked.a) + + +.. _int_vp_reduce_and: + +'``llvm.vp.reduce.and.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+
+::
+
+      declare i32 @llvm.vp.reduce.and.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.and.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``AND`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand. The third operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.and``' intrinsic performs the integer ``AND`` reduction
+(:ref:`llvm.vector.reduce.and <int_vector_reduce_and>`) of the vector operand
+on each enabled lane. Disabled lanes are treated as containing the neutral
+value (``UINT_MAX``, or ``-1``, all bits set) (i.e. having no effect on the
+reduction operation).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.vp.reduce.and.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+      %also.r = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %masked.a)
+
+
+.. _int_vp_reduce_or:
+
+'``llvm.vp.reduce.or.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.vp.reduce.or.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.or.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``OR`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand. The third operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.or``' intrinsic performs the integer ``OR`` reduction
+(:ref:`llvm.vector.reduce.or <int_vector_reduce_or>`) of the vector operand on
+each enabled lane. Disabled lanes are treated as containing the neutral value
+(``0``) (i.e. having no effect on the reduction operation).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.vp.reduce.or.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> zeroinitializer
+      %also.r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %masked.a)
+
+.. _int_vp_reduce_xor:
+
+'``llvm.vp.reduce.xor.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.vp.reduce.xor.v4i32(<4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare i16 @llvm.vp.reduce.xor.nxv8i16(<vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer ``XOR`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of integer values. The result type must be
+the element type of the first operand. The second operand is the vector mask
+and has the same number of elements as the first operand.
The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.xor``' intrinsic performs the integer ``XOR`` reduction +(:ref:`llvm.vector.reduce.xor `) of the vector operand +on each enabled lane. Disabled lanes are treated as containing the neutral +value (``0``) (i.e. having no effect on the reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.xor.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %masked.a) + + +.. _int_vp_reduce_smax: + +'``llvm.vp.reduce.smax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.smax.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.smax.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated signed-integer ``MAX`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.smax``' intrinsic performs the signed-integer ``MAX`` +reduction (:ref:`llvm.vector.reduce.smax `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``INT_MIN``) (i.e. having no effect on the reduction +operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i8 @llvm.vp.reduce.smax.v4i8(<4 x i8> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i8> %a, <4 x i8> + %also.r = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %masked.a) + + +.. _int_vp_reduce_smin: + +'``llvm.vp.reduce.smin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.smin.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.smin.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated signed-integer ``MIN`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.smin``' intrinsic performs the signed-integer ``MIN`` +reduction (:ref:`llvm.vector.reduce.smin `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``INT_MAX``) (i.e. having no effect on the reduction +operation). + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call i8 @llvm.vp.reduce.smin.v4i8(<4 x i8> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i8> %a, <4 x i8> + %also.r = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %masked.a) + + +.. _int_vp_reduce_umax: + +'``llvm.vp.reduce.umax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.umax.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.umax.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated unsigned-integer ``MAX`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.umax``' intrinsic performs the unsigned-integer ``MAX`` +reduction (:ref:`llvm.vector.reduce.umax `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``0``) (i.e. having no effect on the reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.umax.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %masked.a) + + +.. _int_vp_reduce_umin: + +'``llvm.vp.reduce.umin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.umin.v4i32(<4 x i32> , <4 x i1> , i32 ) + declare i16 @llvm.vp.reduce.umin.nxv8i16( , , i32 ) + +Overview: +""""""""" + +Predicated unsigned-integer ``MIN`` reduction of a vector, returning the result +as a scalar. The return type matches the element type of the vector input. + + +Arguments: +"""""""""" + +The first operand must be a vector of integer values. The result type must be +the element type of the first operand. The second operand is the vector mask +and has the same number of elements as the first operand. The third operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.umin``' intrinsic performs the unsigned-integer ``MIN`` +reduction (:ref:`llvm.vector.reduce.umin `) of the +vector operand on each enabled lane. Disabled lanes are treated as containing +the neutral value (``UINT_MAX``, or ``-1``) (i.e. having no effect on the +reduction operation). + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.umin.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> + %also.r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %masked.a) + + +.. 
_int_vp_reduce_fmax:
+
+'``llvm.vp.reduce.fmax.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fmax.v4f32(<4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare double @llvm.vp.reduce.fmax.nxv8f64(<vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``MAX`` reduction of a vector, returning the result
+as a scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of floating-point values. The result type
+must be the element type of the first operand. The second operand is the vector
+mask and has the same number of elements as the first operand. The third
+operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fmax``' intrinsic performs the floating-point ``MAX``
+reduction (:ref:`llvm.vector.reduce.fmax <int_vector_reduce_fmax>`) of the
+vector operand on each enabled lane. Disabled lanes are treated as containing
+the neutral value (i.e. having no effect on the reduction operation).
+
+The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no
+flags are set, the neutral value is ``-QNAN``. If ``nnan`` and ``ninf`` are
+both set, then the neutral value is the smallest floating-point value for the
+result type. If only ``nnan`` is set then the neutral value is ``-Infinity``.
+
+This instruction has the same comparison semantics as the
+:ref:`llvm.vector.reduce.fmax <int_vector_reduce_fmax>` intrinsic (and thus
+the '``llvm.maxnum.*``' intrinsic). That is, the result will always be a number
+unless all elements of the vector are ``NaN``. Note that this means if all
+lanes are disabled the result will *not* be a number. For a vector with maximum
+element magnitude ``0.0`` and containing both ``+0.0`` and ``-0.0`` elements,
+the sign of the result is unspecified.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fmax.v4f32(<4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000, float 0xFFF8000000000000, float 0xFFF8000000000000>
+      %also.r = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %masked.a)
+
+
+.. _int_vp_reduce_fmin:
+
+'``llvm.vp.reduce.fmin.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fmin.v4f32(<4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>)
+      declare double @llvm.vp.reduce.fmin.nxv8f64(<vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``MIN`` reduction of a vector, returning the result
+as a scalar. The return type matches the element type of the vector input.
+
+
+Arguments:
+""""""""""
+
+The first operand must be a vector of floating-point values. The result type
+must be the element type of the first operand. The second operand is the vector
+mask and has the same number of elements as the first operand. The third
+operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fmin``' intrinsic performs the floating-point ``MIN``
+reduction (:ref:`llvm.vector.reduce.fmin <int_vector_reduce_fmin>`) of the
+vector operand on each enabled lane. Disabled lanes are treated as containing
+the neutral value (i.e. having no effect on the reduction operation).
+
+The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no
+flags are set, the neutral value is ``+QNAN``. If ``nnan`` and ``ninf`` are
+both set, then the neutral value is the largest floating-point value for the
+result type. If only ``nnan`` is set then the neutral value is ``+Infinity``.
+
+This instruction has the same comparison semantics as the
+:ref:`llvm.vector.reduce.fmin <int_vector_reduce_fmin>` intrinsic (and thus the
+'``llvm.minnum.*``' intrinsic). That is, the result will always be a number
+unless all elements of the vector are ``NaN``. Note that this means if all
+lanes are disabled the result will *not* be a number. For a vector with maximum
+element magnitude ``0.0`` and containing both ``+0.0`` and ``-0.0`` elements,
+the sign of the result is unspecified.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000>
+      %also.r = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %masked.a)
+
+
 .. _int_get_active_lane_mask:

 '``llvm.get.active.lane.mask.*``' Intrinsics
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -428,6 +428,29 @@
   // Equivalent non-predicated opcode
   static Optional<unsigned> getFunctionalOpcodeForVP(Intrinsic::ID ID);
+
+  static bool isVPReduction(Intrinsic::ID ID);
+};
+
+/// This represents vector predication reduction intrinsics.
+class VPReductionIntrinsic : public VPIntrinsic {
+public:
+
+  unsigned getVectorParamPos() const;
+  Optional<unsigned> getStartParamPos() const;
+
+  static Optional<unsigned> getVectorParamPos(Intrinsic::ID ID);
+  static Optional<unsigned> getStartParamPos(Intrinsic::ID ID);
+
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  /// @{
+  static bool classof(const IntrinsicInst *I) {
+    return VPIntrinsic::isVPReduction(I->getIntrinsicID());
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+  /// @}
 };

 /// This is the common base class for constrained floating point intrinsics.
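As a usage illustration (not part of the patch), a minimal sketch of how client
code might consume the new ``VPReductionIntrinsic`` accessors; the helper name
``splitVPReduction`` is a hypothetical example:

.. code-block:: c++

  // Sketch only: pulling apart a VP reduction call via the new accessors.
  #include "llvm/IR/IntrinsicInst.h"
  #include <utility>
  using namespace llvm;

  // Returns {reduced vector, start value or nullptr}.
  static std::pair<Value *, Value *>
  splitVPReduction(VPReductionIntrinsic &VPR) {
    // Vector operand being reduced: index 0 for the plain reductions, index 1
    // for fadd/fmul, which carry a scalar start value in front of it.
    Value *Vec = VPR.getOperand(VPR.getVectorParamPos());
    // Only llvm.vp.reduce.fadd/fmul have a start value; getStartParamPos()
    // returns None for the integer and fmin/fmax reductions.
    Value *Start = nullptr;
    if (Optional<unsigned> StartPos = VPR.getStartParamPos())
      Start = VPR.getOperand(*StartPos);
    return {Vec, Start};
  }

This mirrors how ``expandPredicationInReduction`` in
``ExpandVectorPredication.cpp`` below reads the operands before rebuilding the
unpredicated reduction.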
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1469,6 +1469,64 @@ llvm_i32_ty]>; } +// Reductions +let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_add : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_mul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_and : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_or : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_xor : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_smax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_smin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_umax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_umin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_fmax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_reduce_fmin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; +} + def int_get_active_lane_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyint_ty, LLVMMatchType<1>], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -100,6 +100,16 @@ #define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) #endif +// Map this VP intrinsic to its canonical functional intrinsic. +#ifndef HANDLE_VP_TO_INTRIN +#define HANDLE_VP_TO_INTRIN(VPID, IID) +#endif + +// Map this VP reduction intrinsic to its reduction operand positions. +#ifndef HANDLE_VP_REDUCTION +#define HANDLE_VP_REDUCTION(ID, STARTPOS, VECTORPOS) +#endif + /// } Property Macros ///// Integer Arithmetic { @@ -191,6 +201,92 @@ ///// } Floating-Point Arithmetic +///// Reduction { + +// Specialized helper macro for reductions (%x, %mask, %evl). +#ifdef HELPER_REGISTER_REDUCTION_VP +#error "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!" 
+#endif +#define HELPER_REGISTER_REDUCTION_VP(VPINTRIN, SDOPC, INTRIN) \ +BEGIN_REGISTER_VP(VPINTRIN, 1, 2, SDOPC, -1) \ +HANDLE_VP_TO_INTRIN(VPINTRIN, INTRIN) \ +HANDLE_VP_REDUCTION(VPINTRIN, None, 0) \ +END_REGISTER_VP(VPINTRIN, SDOPC) + +// llvm.vp.reduce.add(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD, + experimental_vector_reduce_add) + +// llvm.vp.reduce.mul(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_mul, VP_REDUCE_MUL, + experimental_vector_reduce_mul) + +// llvm.vp.reduce.and(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_and, VP_REDUCE_AND, + experimental_vector_reduce_and) + +// llvm.vp.reduce.or(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_or, VP_REDUCE_OR, + experimental_vector_reduce_or) + +// llvm.vp.reduce.xor(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_xor, VP_REDUCE_XOR, + experimental_vector_reduce_xor) + +// llvm.vp.reduce.smax(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_smax, VP_REDUCE_SMAX, + experimental_vector_reduce_smax) + +// llvm.vp.reduce.smin(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_smin, VP_REDUCE_SMIN, + experimental_vector_reduce_smin) + +// llvm.vp.reduce.umax(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_umax, VP_REDUCE_UMAX, + experimental_vector_reduce_umax) + +// llvm.vp.reduce.umin(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_umin, VP_REDUCE_UMIN, + experimental_vector_reduce_umin) + +// llvm.vp.reduce.fmax(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX, + experimental_vector_reduce_fmax) + +// llvm.vp.reduce.fmin(accu,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, + experimental_vector_reduce_fmin) + +#undef HELPER_REGISTER_REDUCTION_VP + +// Specialized helper macro for reductions with a starting value (%acc, %x, %mask, %evl). +#ifdef HELPER_REGISTER_REDUCTION_START_VP +#error "The internal helper macro HELPER_REGISTER_REDUCTION_START_VP is already defined!" 
+#endif +#define HELPER_REGISTER_REDUCTION_START_VP(VPINTRIN, SDOPC, SEQ_SDOPC, INTRIN) \ +BEGIN_REGISTER_VP_INTRINSIC(VPINTRIN, 2, 3) \ +BEGIN_REGISTER_VP_SDNODE(SDOPC, -1, VPINTRIN, 1, 2) \ +END_REGISTER_VP_SDNODE(SDOPC) \ +BEGIN_REGISTER_VP_SDNODE(SEQ_SDOPC, -1, VPINTRIN, 2, 3) \ +END_REGISTER_VP_SDNODE(SEQ_SDOPC) \ +HANDLE_VP_TO_INTRIN(VPINTRIN, INTRIN) \ +HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ +END_REGISTER_VP_INTRINSIC(VPINTRIN) + +// llvm.vp.reduce.fadd(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_START_VP(vp_reduce_fadd, VP_REDUCE_FADD, + VP_REDUCE_SEQ_FADD, + experimental_vector_reduce_fadd) + +// llvm.vp.reduce.fmul(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_START_VP(vp_reduce_fmul, VP_REDUCE_FMUL, + VP_REDUCE_SEQ_FMUL, + experimental_vector_reduce_fmul) + +#undef HELPER_REGISTER_REDUCTION_START_VP + +///// } Reduction + #undef BEGIN_REGISTER_VP #undef BEGIN_REGISTER_VP_INTRINSIC #undef BEGIN_REGISTER_VP_SDNODE @@ -199,3 +295,5 @@ #undef END_REGISTER_VP_SDNODE #undef HANDLE_VP_TO_OPC #undef HANDLE_VP_TO_CONSTRAINEDFP +#undef HANDLE_VP_TO_INTRIN +#undef HANDLE_VP_REDUCTION diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -158,6 +159,11 @@ Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI); + /// \brief Lower this VP reduction to a call to an unpredicated reduction + /// intrinsic. + Value *expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &PI); + /// \brief Query TTI and expand the vector predication in \p P accordingly. Value *expandPredication(VPIntrinsic &PI); @@ -248,6 +254,118 @@ return NewBinOp; } +static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, + Type *EltTy) { + bool Negative = false; + unsigned EltBits = EltTy->getScalarSizeInBits(); + switch (VPI.getIntrinsicID()) { + default: + llvm_unreachable("Expecting a VP reduction intrinsic"); + case Intrinsic::vp_reduce_add: + case Intrinsic::vp_reduce_or: + case Intrinsic::vp_reduce_xor: + case Intrinsic::vp_reduce_umax: + return Constant::getNullValue(EltTy); + case Intrinsic::vp_reduce_mul: + return ConstantInt::get(EltTy, 1, /*IsSigned*/ false); + case Intrinsic::vp_reduce_and: + case Intrinsic::vp_reduce_umin: + return ConstantInt::getAllOnesValue(EltTy); + case Intrinsic::vp_reduce_smin: + return ConstantInt::get(EltTy->getContext(), + APInt::getSignedMaxValue(EltBits)); + case Intrinsic::vp_reduce_smax: + return ConstantInt::get(EltTy->getContext(), + APInt::getSignedMinValue(EltBits)); + case Intrinsic::vp_reduce_fmax: + Negative = true; + LLVM_FALLTHROUGH; + case Intrinsic::vp_reduce_fmin: { + FastMathFlags Flags = VPI.getFastMathFlags(); + const fltSemantics &Semantics = EltTy->getFltSemantics(); + return !Flags.noNaNs() ? ConstantFP::getQNaN(EltTy, Negative) + : !Flags.noInfs() + ? 
ConstantFP::getInfinity(EltTy, Negative) + : ConstantFP::get(EltTy, + APFloat::getLargest(Semantics, Negative)); + } + case Intrinsic::vp_reduce_fadd: + return ConstantFP::getNegativeZero(EltTy); + case Intrinsic::vp_reduce_fmul: + return ConstantFP::get(EltTy, 1.0); + } +} + +Value * +CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &VPI) { + assert((isSafeToSpeculativelyExecute(&VPI) || + VPI.canIgnoreVectorLengthParam()) && + "Implicitly dropping %evl in non-speculatable operator!"); + + Value *Mask = VPI.getMaskParam(); + Value *RedOp = VPI.getOperand(VPI.getVectorParamPos()); + + // Insert neutral element in masked-out positions + if (Mask && !isAllTrueMask(Mask)) { + auto *NeutralElt = getNeutralReductionElement(VPI, VPI.getType()); + auto *NeutralVector = Builder.CreateVectorSplat( + cast(RedOp->getType())->getElementCount(), NeutralElt); + RedOp = Builder.CreateSelect(Mask, RedOp, NeutralVector); + } + + Value *NewReduction; + + switch (VPI.getIntrinsicID()) { + default: + llvm_unreachable("Impossible reduction kind"); + case Intrinsic::vp_reduce_add: + NewReduction = Builder.CreateAddReduce(RedOp); + break; + case Intrinsic::vp_reduce_mul: + NewReduction = Builder.CreateMulReduce(RedOp); + break; + case Intrinsic::vp_reduce_and: + NewReduction = Builder.CreateAndReduce(RedOp); + break; + case Intrinsic::vp_reduce_or: + NewReduction = Builder.CreateOrReduce(RedOp); + break; + case Intrinsic::vp_reduce_xor: + NewReduction = Builder.CreateXorReduce(RedOp); + break; + case Intrinsic::vp_reduce_smax: + NewReduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true); + break; + case Intrinsic::vp_reduce_smin: + NewReduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true); + break; + case Intrinsic::vp_reduce_umax: + NewReduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false); + break; + case Intrinsic::vp_reduce_umin: + NewReduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false); + break; + case Intrinsic::vp_reduce_fmax: + NewReduction = Builder.CreateFPMaxReduce(RedOp); + break; + case Intrinsic::vp_reduce_fmin: + NewReduction = Builder.CreateFPMinReduce(RedOp); + break; + case Intrinsic::vp_reduce_fadd: + NewReduction = Builder.CreateFAddReduce( + VPI.getOperand(*VPI.getStartParamPos()), RedOp); + break; + case Intrinsic::vp_reduce_fmul: + NewReduction = Builder.CreateFMulReduce( + VPI.getOperand(*VPI.getStartParamPos()), RedOp); + break; + } + + replaceOperation(*NewReduction, VPI); + return NewReduction; +} + void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n"); @@ -321,6 +439,9 @@ if (OC && Instruction::isBinaryOp(*OC)) return expandPredicationInBinaryOperator(Builder, VPI); + if (auto *VPRI = dyn_cast(&VPI)) + return expandPredicationInReduction(Builder, *VPRI); + return &VPI; } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -410,11 +410,58 @@ // TODO: Extend this for other VP intrinsics as they are upstreamed. This // works for binary arithmetic VP intrinsics. 
- auto *VPFunc = Intrinsic::getDeclaration(M, VPID, Params[0]->getType()); + Type *OverloadTy = Params[0]->getType(); + if (VPIntrinsic::isVPReduction(VPID)) + OverloadTy = + Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType(); + + auto *VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); assert(VPFunc && "Could not declare VP intrinsic"); return VPFunc; } +bool VPIntrinsic::isVPReduction(Intrinsic::ID ID) { + switch (ID) { + default: + return false; +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + break; +#include "llvm/IR/VPIntrinsics.def" + } + return true; +} + +unsigned VPReductionIntrinsic::getVectorParamPos() const { + return *VPReductionIntrinsic::getVectorParamPos(getIntrinsicID()); +} + +Optional VPReductionIntrinsic::getStartParamPos() const { + return VPReductionIntrinsic::getStartParamPos(getIntrinsicID()); +} + +Optional VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { + switch (ID) { +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + return VECTORPOS; +#include "llvm/IR/VPIntrinsics.def" + default: + return None; + } +} + +Optional VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { + switch (ID) { +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + return STARTPOS; +#include "llvm/IR/VPIntrinsics.def" + default: + return None; + } +} + Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { switch (getIntrinsicID()) { case Intrinsic::uadd_with_overflow: diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll --- a/llvm/test/CodeGen/Generic/expand-vp.ll +++ b/llvm/test/CodeGen/Generic/expand-vp.ll @@ -25,6 +25,10 @@ declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; Reductions +declare i32 @llvm.vp.reduce.add.v4i32(<4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.mul.v4i32(<4 x i32>, <4 x i1>, i32) +declare float @llvm.vp.reduce.fmin.v4f32(<4 x float>, <4 x i1>, i32) ; Fixed vector test function. define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { @@ -78,6 +82,21 @@ %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ret void } + +; Fixed vector reduce test function. +define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { + %r0 = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) + %r1 = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) + ret void +} + +define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { + %r0 = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) + %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) + %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) + ret void +} + ; All VP intrinsics have to be lowered into non-VP ops ; Convert %evl into %mask for non-speculatable VP intrinsics and emit the ; instruction+select idiom with a non-VP SIMD instruction. 
@@ -121,7 +140,23 @@ ; ALL-CONVERT: ret void +; Check that reductions use the correct neutral element for masked-off elements +; ALL-CONVERT: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; ALL-CONVERT-NEXT: [[ADD:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ADD]]) +; ALL-CONVERT-NEXT: [[MUL:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[MUL]]) +; ALL-CONVERT-NEXT: ret void +; Check that reductions use the correct neutral element for masked-off elements +; ALL-CONVERT: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; ALL-CONVERT-NEXT: [[FMIN:%.+]] = select <4 x i1> %m, <4 x float> %vi, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN]]) +; ALL-CONVERT-NEXT: [[FMIN_NNAN:%.+]] = select <4 x i1> %m, <4 x float> %vi, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN]]) +; ALL-CONVERT-NEXT: [[FMIN_NNAN_NINF:%.+]] = select <4 x i1> %m, <4 x float> %vi, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN_NINF]]) +; ALL-CONVERT-NEXT: ret void ; All legal - don't transform anything. @@ -157,6 +192,16 @@ ; LEGAL_LEGAL-NEXT: %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ; LEGAL_LEGAL-NEXT: ret void +; LEGAL_LEGAL: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; LEGAL_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: ret void + +; LEGAL_LEGAL: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; LEGAL_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: ret void ; Drop %evl where possible else fold %evl into %mask (%evl Discard, %mask Legal) ; @@ -205,6 +250,16 @@ ; DISCARD_LEGAL-NOT: %{{.+}} = call @llvm.vp.{{.*}}, i32 %n) ; DISCARD_LEGAL: ret void +; DISCARD_LEGAL: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; DISCARD_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: ret void + +; DISCARD_LEGAL: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; DISCARD_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: ret void ; Convert %evl into %mask everywhere (%evl Convert, %mask Legal) ; @@ -243,3 +298,21 @@ ; CONVERT_LEGAL-NOT: %{{.*}} = call @llvm.vp.{{.*}}, i32 %n) ; CONVERT_LEGAL: ret void +; CONVERT_LEGAL: define void @test_vp_reduce_int_v4(<4 x i32> %vi, <4 x i1> %m, i32 %n) { +; 
CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; CONVERT_LEGAL-NEXT: %{{.+}} = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vi, <4 x i1> [[NEWM]], i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.mul.v4i32(<4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL: ret void + +; CONVERT_LEGAL: define void @test_vp_reduce_fp_v4(<4 x float> %vi, <4 x i1> %m, i32 %n) { +; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; CONVERT_LEGAL-NEXT: %{{.+}} = call float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> [[NEWM]], i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(<4 x float> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL: ret void diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll --- a/llvm/test/Verifier/vp-intrinsics.ll +++ b/llvm/test/Verifier/vp-intrinsics.ll @@ -29,6 +29,24 @@ ; TODO: test_vp_constrained_fp + +define void @test_vp_reduction(<8 x i32> %vi, <8 x float> %vf, float %f, <8 x i1> %m, i32 %n) { + %r0 = call i32 @llvm.vp.reduce.add.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r1 = call i32 @llvm.vp.reduce.mul.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r2 = call i32 @llvm.vp.reduce.and.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r3 = call i32 @llvm.vp.reduce.or.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r4 = call i32 @llvm.vp.reduce.xor.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r5 = call i32 @llvm.vp.reduce.smax.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r6 = call i32 @llvm.vp.reduce.smin.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r7 = call i32 @llvm.vp.reduce.umax.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r8 = call i32 @llvm.vp.reduce.umin.v8i32(<8 x i32> %vi, <8 x i1> %m, i32 %n) + %r9 = call float @llvm.vp.reduce.fmin.v8f32(<8 x float> %vf, <8 x i1> %m, i32 %n) + %rA = call float @llvm.vp.reduce.fmax.v8f32(<8 x float> %vf, <8 x i1> %m, i32 %n) + %rB = call float @llvm.vp.reduce.fadd.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n) + %rC = call float @llvm.vp.reduce.fmul.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n) + ret void +} + ; integer arith declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) @@ -50,3 +68,17 @@ declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +; reductions +declare i32 @llvm.vp.reduce.add.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.mul.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.and.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.or.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.xor.v8i32(<8 x i32>, <8 x i1>, i32) 
+declare i32 @llvm.vp.reduce.smax.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.smin.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.umax.v8i32(<8 x i32>, <8 x i1>, i32) +declare i32 @llvm.vp.reduce.umin.v8i32(<8 x i32>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fmin.v8f32(<8 x float>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fmax.v8f32(<8 x float>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32) +declare float @llvm.vp.reduce.fmul.v8f32(float, <8 x float>, <8 x i1>, i32) diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -46,6 +46,21 @@ Str << " declare <8 x float> @llvm.vp." << BinaryFPOpcode << ".v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) "; + const char *ReductionIntOpcodes[] = {"add", "mul", "and", "or", "xor", + "smin", "smax", "umin", "umax"}; + for (const char *ReductionOpcode : ReductionIntOpcodes) + Str << " declare i32 @llvm.vp.reduce." << ReductionOpcode + << ".v8i32(<8 x i32>, <8 x i1>, i32) "; + + std::pair ReductionFPOpcodes[] = { + {"fadd", true}, {"fmul", true}, {"fmin", false}, {"fmax", false}}; + for (const auto &ReductionPair : ReductionFPOpcodes) + Str << " declare float @llvm.vp.reduce." << ReductionPair.first + << ".v8f32(" << (ReductionPair.second ? "float, " : "") + << "<8 x float>, <8 x i1>, i32) "; + + dbgs() << Str.str() << "\n"; + return parseAssemblyString(Str.str(), Err, C); } };
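As a usage illustration (not part of the patch), a minimal sketch of emitting
one of the new intrinsics by hand; the helper name ``emitVPReduceAdd`` and the
surrounding values are assumptions for the example:

.. code-block:: c++

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // For a <4 x i32> input this builds, e.g.:
  //   %red = call i32 @llvm.vp.reduce.add.v4i32(<4 x i32> %vec, <4 x i1> %mask, i32 %evl)
  static Value *emitVPReduceAdd(Module &M, IRBuilder<> &Builder, Value *Vec,
                                Value *Mask, Value *EVL) {
    // The intrinsic is overloaded only on the vector operand; the scalar
    // result type is implied by its element type, matching the TableGen
    // definitions added above.
    Function *F = Intrinsic::getDeclaration(&M, Intrinsic::vp_reduce_add,
                                            {Vec->getType()});
    return Builder.CreateCall(F, {Vec, Mask, EVL}, "red");
  }

On targets that do not support an explicit vector length, the
ExpandVectorPredication pass folds such a call back into a ``select`` on the
mask followed by ``llvm.vector.reduce.add``, as exercised by the
``expand-vp.ll`` checks above.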