diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16593,6 +16593,7 @@ respective operation across all elements of the vector, returning a single scalar result of the same element type. +.. _int_vector_reduce_add: '``llvm.vector.reduce.add.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16616,6 +16617,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fadd: + '``llvm.vector.reduce.fadd.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16668,6 +16671,8 @@ %ord = call float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %input) ; sequential reduction +.. _int_vector_reduce_mul: + '``llvm.vector.reduce.mul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16690,6 +16695,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fmul: + '``llvm.vector.reduce.fmul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16741,6 +16748,8 @@ %unord = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.0, <4 x float> %input) ; relaxed reduction %ord = call float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %input) ; sequential reduction +.. _int_vector_reduce_and: + '``llvm.vector.reduce.and.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16762,6 +16771,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_or: + '``llvm.vector.reduce.or.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16783,6 +16794,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_xor: + '``llvm.vector.reduce.xor.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16804,6 +16817,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. 
_int_vector_reduce_smax: + '``llvm.vector.reduce.smax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16825,6 +16840,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_smin: + '``llvm.vector.reduce.smin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16846,6 +16863,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_umax: + '``llvm.vector.reduce.umax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16867,6 +16886,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_umin: + '``llvm.vector.reduce.umin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16888,6 +16909,8 @@ """""""""" The argument to this intrinsic must be a vector of integer values. +.. _int_vector_reduce_fmax: + '``llvm.vector.reduce.fmax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16918,6 +16941,8 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. +.. _int_vector_reduce_fmin: + '``llvm.vector.reduce.fmin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -18530,6 +18555,775 @@ +.. _int_vp_reduce_add: + +'``llvm.vp.reduce.add.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.add.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.add.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer ``ADD`` reduction of a vector and a scalar starting value, +returning the result as a scalar. + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. 
The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.add``' intrinsic performs the integer ``ADD`` reduction +(:ref:`llvm.vector.reduce.add <int_vector_reduce_add>`) of the vector operand +``val`` on each enabled lane, adding it to the scalar ``start_value``. Disabled +lanes are treated as containing the neutral value ``0`` (i.e. having no effect +on the reduction operation). If the vector length is zero, the result is equal +to ``start_value``. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> zeroinitializer + %reduction = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked.a) + %also.r = add i32 %reduction, %start + + +.. _int_vp_reduce_fadd: + +'``llvm.vp.reduce.fadd.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare float @llvm.vp.reduce.fadd.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare double @llvm.vp.reduce.fadd.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point ``ADD`` reduction of a vector and a scalar starting +value, returning the result as a scalar. + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +floating-point type equal to the result type. The second operand is the vector +on which the reduction is performed and must be a vector of floating-point +values whose element type is the result/start type. 
The third operand is the +vector mask and is a vector of boolean values with the same number of elements +as the vector operand. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.fadd``' intrinsic performs the floating-point ``ADD`` +reduction (:ref:`llvm.vector.reduce.fadd <int_vector_reduce_fadd>`) of the +vector operand ``val`` on each enabled lane, adding it to the scalar +``start_value``. Disabled lanes are treated as containing the neutral value +``-0.0`` (i.e. having no effect on the reduction operation). If no lanes are +enabled, the resulting value will be equal to ``start_value``. + +To ignore the start value, the neutral value can be used. + +See the unpredicated version (:ref:`llvm.vector.reduce.fadd +<int_vector_reduce_fadd>`) for more detail on the semantics of the reduction. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0> + %also.r = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %masked.a) + + +.. _int_vp_reduce_mul: + +'``llvm.vp.reduce.mul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.mul.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.mul.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer ``MUL`` reduction of a vector and a scalar starting value, +returning the result as a scalar. + + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. 
The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.mul``' intrinsic performs the integer ``MUL`` reduction +(:ref:`llvm.vector.reduce.mul <int_vector_reduce_mul>`) of the vector operand ``val`` +on each enabled lane, multiplying it by the scalar ``start_value``. Disabled +lanes are treated as containing the neutral value ``1`` (i.e. having no +effect on the reduction operation). If the vector length is zero, the result is +the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %reduction = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %masked.a) + %also.r = mul i32 %reduction, %start + +.. _int_vp_reduce_fmul: + +'``llvm.vp.reduce.fmul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare float @llvm.vp.reduce.fmul.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare double @llvm.vp.reduce.fmul.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point ``MUL`` reduction of a vector and a scalar starting +value, returning the result as a scalar. + + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +floating-point type equal to the result type. 
The second operand is the vector +on which the reduction is performed and must be a vector of floating-point +values whose element type is the result/start type. The third operand is the +vector mask and is a vector of boolean values with the same number of elements +as the vector operand. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.fmul``' intrinsic performs the floating-point ``MUL`` +reduction (:ref:`llvm.vector.reduce.fmul <int_vector_reduce_fmul>`) of the +vector operand ``val`` on each enabled lane, multiplying it by the scalar +``start_value``. Disabled lanes are treated as containing the neutral value +``1.0`` (i.e. having no effect on the reduction operation). If no lanes are +enabled, the resulting value will be equal to the starting value. + +To ignore the start value, the neutral value can be used. + +See the unpredicated version (:ref:`llvm.vector.reduce.fmul +<int_vector_reduce_fmul>`) for more detail on the semantics. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> + %also.r = call float @llvm.vector.reduce.fmul.v4f32(float %start, <4 x float> %masked.a) + + +.. _int_vp_reduce_and: + +'``llvm.vp.reduce.and.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.and.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.and.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer ``AND`` reduction of a vector and a scalar starting value, +returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.and``' intrinsic performs the integer ``AND`` reduction +(:ref:`llvm.vector.reduce.and <int_vector_reduce_and>`) of the vector operand +``val`` on each enabled lane, performing an '``and``' of that with the +scalar ``start_value``. Disabled lanes are treated as containing the neutral +value ``UINT_MAX``, or ``-1`` (i.e. having no effect on the reduction +operation). If the vector length is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + %reduction = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %masked.a) + %also.r = and i32 %reduction, %start + + +.. _int_vp_reduce_or: + +'``llvm.vp.reduce.or.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.or.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.or.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer ``OR`` reduction of a vector and a scalar starting value, +returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.or``' intrinsic performs the integer ``OR`` reduction +(:ref:`llvm.vector.reduce.or <int_vector_reduce_or>`) of the vector operand +``val`` on each enabled lane, performing an '``or``' of that with the scalar +``start_value``. Disabled lanes are treated as containing the neutral value +``0`` (i.e. having no effect on the reduction operation). If the vector length +is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + %reduction = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %masked.a) + %also.r = or i32 %reduction, %start + +.. _int_vp_reduce_xor: + +'``llvm.vp.reduce.xor.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.xor.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.xor.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer ``XOR`` reduction of a vector and a scalar starting value, +returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.xor``' intrinsic performs the integer ``XOR`` reduction +(:ref:`llvm.vector.reduce.xor <int_vector_reduce_xor>`) of the vector operand +``val`` on each enabled lane, performing an '``xor``' of that with the scalar +``start_value``. Disabled lanes are treated as containing the neutral value +``0`` (i.e. having no effect on the reduction operation). If the vector length +is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + %reduction = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %masked.a) + %also.r = xor i32 %reduction, %start + + +.. _int_vp_reduce_smax: + +'``llvm.vp.reduce.smax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.smax.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.smax.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated signed-integer ``MAX`` reduction of a vector and a scalar starting +value, returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.smax``' intrinsic performs the signed-integer ``MAX`` +reduction (:ref:`llvm.vector.reduce.smax <int_vector_reduce_smax>`) of the +vector operand ``val`` on each enabled lane, taking the maximum of that and +the scalar ``start_value``. Disabled lanes are treated as containing the +neutral value ``INT_MIN`` (i.e. having no effect on the reduction operation). +If the vector length is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i8 @llvm.vp.reduce.smax.v4i8(i8 %start, <4 x i8> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i8> %a, <4 x i8> <i8 -128, i8 -128, i8 -128, i8 -128> + %reduction = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %masked.a) + %also.r = call i8 @llvm.smax.i8(i8 %reduction, i8 %start) + + +.. _int_vp_reduce_smin: + +'``llvm.vp.reduce.smin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.smin.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.smin.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated signed-integer ``MIN`` reduction of a vector and a scalar starting +value, returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.smin``' intrinsic performs the signed-integer ``MIN`` +reduction (:ref:`llvm.vector.reduce.smin <int_vector_reduce_smin>`) of the +vector operand ``val`` on each enabled lane, taking the minimum of that and +the scalar ``start_value``. Disabled lanes are treated as containing the +neutral value ``INT_MAX`` (i.e. having no effect on the reduction operation). +If the vector length is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i8 @llvm.vp.reduce.smin.v4i8(i8 %start, <4 x i8> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i8> %a, <4 x i8> <i8 127, i8 127, i8 127, i8 127> + %reduction = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %masked.a) + %also.r = call i8 @llvm.smin.i8(i8 %reduction, i8 %start) + + +.. _int_vp_reduce_umax: + +'``llvm.vp.reduce.umax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.umax.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.umax.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated unsigned-integer ``MAX`` reduction of a vector and a scalar starting +value, returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.umax``' intrinsic performs the unsigned-integer ``MAX`` +reduction (:ref:`llvm.vector.reduce.umax <int_vector_reduce_umax>`) of the +vector operand ``val`` on each enabled lane, taking the maximum of that and +the scalar ``start_value``. Disabled lanes are treated as containing the +neutral value ``0`` (i.e. having no effect on the reduction operation). If the +vector length is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + %reduction = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %masked.a) + %also.r = call i32 @llvm.umax.i32(i32 %reduction, i32 %start) + + +.. _int_vp_reduce_umin: + +'``llvm.vp.reduce.umin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.vp.reduce.umin.v4i32(i32 <start_value>, <4 x i32> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare i16 @llvm.vp.reduce.umin.nxv8i16(i16 <start_value>, <vscale x 8 x i16> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated unsigned-integer ``MIN`` reduction of a vector and a scalar starting +value, returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +integer type equal to the result type. The second operand is the vector on +which the reduction is performed and must be a vector of integer values whose +element type is the result/start type. The third operand is the vector mask and +is a vector of boolean values with the same number of elements as the vector +operand. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.umin``' intrinsic performs the unsigned-integer ``MIN`` +reduction (:ref:`llvm.vector.reduce.umin <int_vector_reduce_umin>`) of the +vector operand ``val`` on each enabled lane, taking the minimum of that and the +scalar ``start_value``. Disabled lanes are treated as containing the neutral +value ``UINT_MAX``, or ``-1`` (i.e. having no effect on the reduction +operation). If the vector length is zero, the result is the start value. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + %reduction = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %masked.a) + %also.r = call i32 @llvm.umin.i32(i32 %reduction, i32 %start) + + +.. _int_vp_reduce_fmax: + +'``llvm.vp.reduce.fmax.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare float @llvm.vp.reduce.fmax.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare double @llvm.vp.reduce.fmax.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point ``MAX`` reduction of a vector and a scalar starting +value, returning the result as a scalar. 
+ + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +floating-point type equal to the result type. The second operand is the vector +on which the reduction is performed and must be a vector of floating-point +values whose element type is the result/start type. The third operand is the +vector mask and is a vector of boolean values with the same number of elements +as the vector operand. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.fmax``' intrinsic performs the floating-point ``MAX`` +reduction (:ref:`llvm.vector.reduce.fmax <int_vector_reduce_fmax>`) of the +vector operand ``val`` on each enabled lane, taking the maximum of that and the +scalar ``start_value``. Disabled lanes are treated as containing the neutral +value (i.e. having no effect on the reduction operation). If the vector length +is zero, the result is the start value. + +The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no +flags are set, the neutral value is ``-QNAN``. If ``nnan`` and ``ninf`` are +both set, then the neutral value is the smallest floating-point value for the +result type. If only ``nnan`` is set then the neutral value is ``-Infinity``. + +This instruction has the same comparison semantics as the +:ref:`llvm.vector.reduce.fmax <int_vector_reduce_fmax>` intrinsic (and thus the +'``llvm.maxnum.*``' intrinsic). That is, the result will always be a number +unless all elements of the vector and the starting value are ``NaN``. For a +vector with maximum element magnitude ``0.0`` and containing both ``+0.0`` and +``-0.0`` elements, the sign of the result is unspecified. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call float @llvm.vp.reduce.fmax.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float QNAN, float QNAN, float QNAN, float QNAN> + %reduction = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %masked.a) + %also.r = call float @llvm.maxnum.f32(float %reduction, float %start) + + +.. _int_vp_reduce_fmin: + +'``llvm.vp.reduce.fmin.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare float @llvm.vp.reduce.fmin.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, i32 <vector_length>) + declare double @llvm.vp.reduce.fmin.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point ``MIN`` reduction of a vector and a scalar starting +value, returning the result as a scalar. + + +Arguments: +"""""""""" + +The first operand is the start value of the reduction, which must be a scalar +floating-point type equal to the result type. The second operand is the vector +on which the reduction is performed and must be a vector of floating-point +values whose element type is the result/start type. The third operand is the +vector mask and is a vector of boolean values with the same number of elements +as the vector operand. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.reduce.fmin``' intrinsic performs the floating-point ``MIN`` +reduction (:ref:`llvm.vector.reduce.fmin <int_vector_reduce_fmin>`) of the +vector operand ``val`` on each enabled lane, taking the minimum of that and the +scalar ``start_value``. Disabled lanes are treated as containing the neutral +value (i.e. having no effect on the reduction operation). If the vector length +is zero, the result is the start value. + +The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. 
If no +flags are set, the neutral value is ``+QNAN``. If ``nnan`` and ``ninf`` are +both set, then the neutral value is the largest floating-point value for the +result type. If only ``nnan`` is set then the neutral value is ``+Infinity``. + +This instruction has the same comparison semantics as the +:ref:`llvm.vector.reduce.fmin <int_vector_reduce_fmin>` intrinsic (and thus the +'``llvm.minnum.*``' intrinsic). That is, the result will always be a number +unless all elements of the vector and the starting value are ``NaN``. For a +vector with maximum element magnitude ``0.0`` and containing both ``+0.0`` and +``-0.0`` elements, the sign of the result is unspecified. + +To ignore the start value, the neutral value can be used. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call float @llvm.vp.reduce.fmin.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl) + ; %r is equivalent to %also.r, where lanes greater than or equal to %evl + ; are treated as though %mask were false for those lanes. + + %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float QNAN, float QNAN, float QNAN, float QNAN> + %reduction = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %masked.a) + %also.r = call float @llvm.minnum.f32(float %reduction, float %start) + + .. _int_get_active_lane_mask: '``llvm.get.active.lane.mask.*``' Intrinsics diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -448,6 +448,28 @@ static Optional getFunctionalOpcodeForVP(Intrinsic::ID ID); }; +/// This represents vector predication reduction intrinsics. 
+class VPReductionIntrinsic : public VPIntrinsic {
+public:
+  static bool isVPReduction(Intrinsic::ID ID);
+  /// Operand index of the start value / of the vector being reduced.
+  unsigned getStartParamPos() const;
+  unsigned getVectorParamPos() const;
+  /// The same operand positions, looked up by intrinsic ID.
+  static Optional<unsigned> getStartParamPos(Intrinsic::ID ID);
+  static Optional<unsigned> getVectorParamPos(Intrinsic::ID ID);
+
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  /// @{
+  static bool classof(const IntrinsicInst *I) {
+    return VPReductionIntrinsic::isVPReduction(I->getIntrinsicID());
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+  /// @}
+};
+
 /// This is the common base class for constrained floating point intrinsics.
 class ConstrainedFPIntrinsic : public IntrinsicInst {
 public:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1498,6 +1498,75 @@
                               llvm_i32_ty]>;
 }
 
+// Reductions: (start value, vector, mask, explicit vector length) -> scalar.
+let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in {
+  def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_add  : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_mul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_and : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_or : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_xor : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_smax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_smin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_umax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_umin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_fmax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_reduce_fmin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                             [LLVMVectorElementType<0>,
+                              llvm_anyvector_ty,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+}
+
 def int_get_active_lane_mask:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
             [llvm_anyint_ty, LLVMMatchType<1>],
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -111,6 +111,11 @@
 #define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS)
 #endif
 
+// Map this VP reduction intrinsic to its reduction operand positions.
+#ifndef HANDLE_VP_REDUCTION +#define HANDLE_VP_REDUCTION(ID, STARTPOS, VECTORPOS) +#endif + /// } Property Macros ///// Integer Arithmetic { @@ -231,6 +236,91 @@ ///// } Memory Operations +///// Reductions { + +// Specialized helper macro for reductions (%x, %mask, %evl). +#ifdef HELPER_REGISTER_REDUCTION_VP +#error "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!" +#endif +#define HELPER_REGISTER_REDUCTION_VP(VPINTRIN, SDOPC, INTRIN) \ +BEGIN_REGISTER_VP(VPINTRIN, 2, 3, SDOPC, -1) \ +HANDLE_VP_TO_INTRIN(INTRIN) \ +HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ +END_REGISTER_VP(VPINTRIN, SDOPC) + +// llvm.vp.reduce.add(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD, + experimental_vector_reduce_add) + +// llvm.vp.reduce.mul(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_mul, VP_REDUCE_MUL, + experimental_vector_reduce_mul) + +// llvm.vp.reduce.and(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_and, VP_REDUCE_AND, + experimental_vector_reduce_and) + +// llvm.vp.reduce.or(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_or, VP_REDUCE_OR, + experimental_vector_reduce_or) + +// llvm.vp.reduce.xor(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_xor, VP_REDUCE_XOR, + experimental_vector_reduce_xor) + +// llvm.vp.reduce.smax(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_smax, VP_REDUCE_SMAX, + experimental_vector_reduce_smax) + +// llvm.vp.reduce.smin(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_smin, VP_REDUCE_SMIN, + experimental_vector_reduce_smin) + +// llvm.vp.reduce.umax(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_umax, VP_REDUCE_UMAX, + experimental_vector_reduce_umax) + +// llvm.vp.reduce.umin(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_umin, VP_REDUCE_UMIN, + experimental_vector_reduce_umin) + +// llvm.vp.reduce.fmax(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX, + 
experimental_vector_reduce_fmax) + +// llvm.vp.reduce.fmin(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, + experimental_vector_reduce_fmin) + +#undef HELPER_REGISTER_REDUCTION_VP + +// Specialized helper macro for reductions with a starting value (%acc, %x, %mask, %evl). +#ifdef HELPER_REGISTER_REDUCTION_SEQ_VP +#error "The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!" +#endif +#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPINTRIN, SDOPC, SEQ_SDOPC, INTRIN) \ +BEGIN_REGISTER_VP_INTRINSIC(VPINTRIN, 2, 3) \ +BEGIN_REGISTER_VP_SDNODE(SDOPC, -1, VPINTRIN, 2, 3) \ +END_REGISTER_VP_SDNODE(SDOPC) \ +BEGIN_REGISTER_VP_SDNODE(SEQ_SDOPC, -1, VPINTRIN, 2, 3) \ +END_REGISTER_VP_SDNODE(SEQ_SDOPC) \ +HANDLE_VP_TO_INTRIN(INTRIN) \ +HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ +END_REGISTER_VP_INTRINSIC(VPINTRIN) + +// llvm.vp.reduce.fadd(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fadd, VP_REDUCE_FADD, + VP_REDUCE_SEQ_FADD, + experimental_vector_reduce_fadd) + +// llvm.vp.reduce.fmul(start,x,mask,vlen) +HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fmul, VP_REDUCE_FMUL, + VP_REDUCE_SEQ_FMUL, + experimental_vector_reduce_fmul) + +#undef HELPER_REGISTER_REDUCTION_SEQ_VP + +///// } Reduction #undef BEGIN_REGISTER_VP #undef BEGIN_REGISTER_VP_INTRINSIC @@ -242,3 +332,4 @@ #undef HANDLE_VP_TO_CONSTRAINEDFP #undef HANDLE_VP_TO_INTRIN #undef HANDLE_VP_IS_MEMOP +#undef HANDLE_VP_REDUCTION diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -158,6 +159,11 @@ Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, 
VPIntrinsic &PI); + /// \brief Lower this VP reduction to a call to an unpredicated reduction + /// intrinsic. + Value *expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &PI); + /// \brief Query TTI and expand the vector predication in \p P accordingly. Value *expandPredication(VPIntrinsic &PI); @@ -248,6 +254,136 @@ return NewBinOp; } +static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, + Type *EltTy) { + bool Negative = false; + unsigned EltBits = EltTy->getScalarSizeInBits(); + switch (VPI.getIntrinsicID()) { + default: + llvm_unreachable("Expecting a VP reduction intrinsic"); + case Intrinsic::vp_reduce_add: + case Intrinsic::vp_reduce_or: + case Intrinsic::vp_reduce_xor: + case Intrinsic::vp_reduce_umax: + return Constant::getNullValue(EltTy); + case Intrinsic::vp_reduce_mul: + return ConstantInt::get(EltTy, 1, /*IsSigned*/ false); + case Intrinsic::vp_reduce_and: + case Intrinsic::vp_reduce_umin: + return ConstantInt::getAllOnesValue(EltTy); + case Intrinsic::vp_reduce_smin: + return ConstantInt::get(EltTy->getContext(), + APInt::getSignedMaxValue(EltBits)); + case Intrinsic::vp_reduce_smax: + return ConstantInt::get(EltTy->getContext(), + APInt::getSignedMinValue(EltBits)); + case Intrinsic::vp_reduce_fmax: + Negative = true; + LLVM_FALLTHROUGH; + case Intrinsic::vp_reduce_fmin: { + FastMathFlags Flags = VPI.getFastMathFlags(); + const fltSemantics &Semantics = EltTy->getFltSemantics(); + return !Flags.noNaNs() ? ConstantFP::getQNaN(EltTy, Negative) + : !Flags.noInfs() + ? 
ConstantFP::getInfinity(EltTy, Negative) + : ConstantFP::get(EltTy, + APFloat::getLargest(Semantics, Negative)); + } + case Intrinsic::vp_reduce_fadd: + return ConstantFP::getNegativeZero(EltTy); + case Intrinsic::vp_reduce_fmul: + return ConstantFP::get(EltTy, 1.0); + } +} + +Value * +CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &VPI) { + assert((isSafeToSpeculativelyExecute(&VPI) || + VPI.canIgnoreVectorLengthParam()) && + "Implicitly dropping %evl in non-speculatable operator!"); + + Value *Mask = VPI.getMaskParam(); + Value *RedOp = VPI.getOperand(VPI.getVectorParamPos()); + + // Insert neutral element in masked-out positions + if (Mask && !isAllTrueMask(Mask)) { + auto *NeutralElt = getNeutralReductionElement(VPI, VPI.getType()); + auto *NeutralVector = Builder.CreateVectorSplat( + cast<VectorType>(RedOp->getType())->getElementCount(), NeutralElt); + RedOp = Builder.CreateSelect(Mask, RedOp, NeutralVector); + } + + Value *Reduction; + Value *Start = VPI.getOperand(VPI.getStartParamPos()); + + switch (VPI.getIntrinsicID()) { + default: + llvm_unreachable("Impossible reduction kind"); + case Intrinsic::vp_reduce_add: + Reduction = Builder.CreateAddReduce(RedOp); + Reduction = Builder.CreateAdd(Reduction, Start); + break; + case Intrinsic::vp_reduce_mul: + Reduction = Builder.CreateMulReduce(RedOp); + Reduction = Builder.CreateMul(Reduction, Start); + break; + case Intrinsic::vp_reduce_and: + Reduction = Builder.CreateAndReduce(RedOp); + Reduction = Builder.CreateAnd(Reduction, Start); + break; + case Intrinsic::vp_reduce_or: + Reduction = Builder.CreateOrReduce(RedOp); + Reduction = Builder.CreateOr(Reduction, Start); + break; + case Intrinsic::vp_reduce_xor: + Reduction = Builder.CreateXorReduce(RedOp); + Reduction = Builder.CreateXor(Reduction, Start); + break; + case Intrinsic::vp_reduce_smax: + Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true); + Reduction = + Builder.CreateBinaryIntrinsic(Intrinsic::smax, 
Reduction, Start); + break; + case Intrinsic::vp_reduce_smin: + Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true); + Reduction = + Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start); + break; + case Intrinsic::vp_reduce_umax: + Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false); + Reduction = + Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start); + break; + case Intrinsic::vp_reduce_umin: + Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false); + Reduction = + Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start); + break; + case Intrinsic::vp_reduce_fmax: + Reduction = Builder.CreateFPMaxReduce(RedOp); + transferDecorations(*Reduction, VPI); + Reduction = + Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start); + break; + case Intrinsic::vp_reduce_fmin: + Reduction = Builder.CreateFPMinReduce(RedOp); + transferDecorations(*Reduction, VPI); + Reduction = + Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start); + break; + case Intrinsic::vp_reduce_fadd: + Reduction = Builder.CreateFAddReduce(Start, RedOp); + break; + case Intrinsic::vp_reduce_fmul: + Reduction = Builder.CreateFMulReduce(Start, RedOp); + break; + } + + replaceOperation(*Reduction, VPI); + return Reduction; +} + void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n"); @@ -321,6 +457,9 @@ if (OC && Instruction::isBinaryOp(*OC)) return expandPredicationInBinaryOperator(Builder, VPI); + if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI)) + return expandPredicationInReduction(Builder, *VPRI); + return &VPI; } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -473,9 +473,15 @@ assert(isVPIntrinsic(VPID) && "not a VP intrinsic"); Function *VPFunc; switch (VPID) { - default: - VPFunc = Intrinsic::getDeclaration(M, VPID, Params[0]->getType()); + default: 
{ + Type *OverloadTy = Params[0]->getType(); + if (VPReductionIntrinsic::isVPReduction(VPID)) + OverloadTy = + Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType(); + + VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); break; + } case Intrinsic::vp_load: VPFunc = Intrinsic::getDeclaration( M, VPID, @@ -504,6 +510,48 @@ return VPFunc; } +bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { + switch (ID) { + default: + return false; +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + break; +#include "llvm/IR/VPIntrinsics.def" + } + return true; +} + +unsigned VPReductionIntrinsic::getVectorParamPos() const { + return *VPReductionIntrinsic::getVectorParamPos(getIntrinsicID()); +} + +unsigned VPReductionIntrinsic::getStartParamPos() const { + return *VPReductionIntrinsic::getStartParamPos(getIntrinsicID()); +} + +Optional<unsigned> VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { + switch (ID) { +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + return VECTORPOS; +#include "llvm/IR/VPIntrinsics.def" + default: + return None; + } +} + +Optional<unsigned> VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { + switch (ID) { +#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ + case Intrinsic::VPID: \ + return STARTPOS; +#include "llvm/IR/VPIntrinsics.def" + default: + return None; + } +} + Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { switch (getIntrinsicID()) { case Intrinsic::uadd_with_overflow: diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll --- a/llvm/test/CodeGen/Generic/expand-vp.ll +++ b/llvm/test/CodeGen/Generic/expand-vp.ll @@ -25,6 +25,20 @@ declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; Reductions +declare i32 
@llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.mul.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32) +declare float @llvm.vp.reduce.fmin.v4f32(float, <4 x float>, <4 x i1>, i32) +declare float @llvm.vp.reduce.fmax.v4f32(float, <4 x float>, <4 x i1>, i32) +declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32) +declare float @llvm.vp.reduce.fmul.v4f32(float, <4 x float>, <4 x i1>, i32) ; Fixed vector test function. define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { @@ -78,6 +92,35 @@ %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ret void } + +; Fixed vector reduce test function. 
+define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { + %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) + ret void +} + +define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { + %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) + 
ret void +} + ; All VP intrinsics have to be lowered into non-VP ops ; Convert %evl into %mask for non-speculatable VP intrinsics and emit the ; instruction+select idiom with a non-VP SIMD instruction. @@ -121,7 +164,66 @@ ; ALL-CONVERT: ret void +; Check that reductions use the correct neutral element for masked-off elements +; ALL-CONVERT: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { +; ALL-CONVERT-NEXT: [[ADD:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ADD]]) +; ALL-CONVERT-NEXT: %{{.+}} = add i32 [[RED]], %start +; ALL-CONVERT-NEXT: [[MUL:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[MUL]]) +; ALL-CONVERT-NEXT: %{{.+}} = mul i32 [[RED]], %start +; ALL-CONVERT-NEXT: [[AND:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[AND]]) +; ALL-CONVERT-NEXT: %{{.+}} = and i32 [[RED]], %start +; ALL-CONVERT-NEXT: [[OR:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[OR]]) +; ALL-CONVERT-NEXT: %{{.+}} = or i32 [[RED]], %start +; ALL-CONVERT-NEXT: [[XOR:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[XOR]]) +; ALL-CONVERT-NEXT: %{{.+}} = xor i32 [[RED]], %start +; ALL-CONVERT-NEXT: [[SMIN:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[SMIN]]) +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.smin.i32(i32 [[RED]], i32 %start) +; ALL-CONVERT-NEXT: [[SMAX:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 
@llvm.vector.reduce.smax.v4i32(<4 x i32> [[SMAX]]) +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.smax.i32(i32 [[RED]], i32 %start) +; ALL-CONVERT-NEXT: [[UMIN:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[UMIN]]) +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.umin.i32(i32 [[RED]], i32 %start) +; ALL-CONVERT-NEXT: [[UMAX:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[UMAX]]) +; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.umax.i32(i32 [[RED]], i32 %start) +; ALL-CONVERT-NEXT: ret void +; Check that reductions use the correct neutral element for masked-off elements +; ALL-CONVERT: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { +; ALL-CONVERT-NEXT: [[FMIN:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN]]) +; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.minnum.f32(float [[RED]], float %f) +; ALL-CONVERT-NEXT: [[FMIN_NNAN:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN]]) +; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.minnum.f32(float [[RED]], float %f) +; ALL-CONVERT-NEXT: [[FMIN_NNAN_NINF:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN_NINF]]) +; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.minnum.f32(float [[RED]], float %f) +; ALL-CONVERT-NEXT: [[FMAX:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX]]) +; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.maxnum.f32(float [[RED]], float 
%f) +; ALL-CONVERT-NEXT: [[FMAX_NNAN:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN]]) +; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.maxnum.f32(float [[RED]], float %f) +; ALL-CONVERT-NEXT: [[FMAX_NNAN_NINF:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN_NINF]]) +; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.maxnum.f32(float [[RED]], float %f) +; ALL-CONVERT-NEXT: [[FADD:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]]) +; ALL-CONVERT-NEXT: [[FADD:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]]) +; ALL-CONVERT-NEXT: [[FMUL:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fmul.v4f32(float %f, <4 x float> [[FMUL]]) +; ALL-CONVERT-NEXT: [[FMUL:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: %{{.+}} = call reassoc float @llvm.vector.reduce.fmul.v4f32(float %f, <4 x float> [[FMUL]]) +; ALL-CONVERT-NEXT: ret void ; All legal - don't transform anything. 
@@ -157,6 +259,30 @@ ; LEGAL_LEGAL-NEXT: %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ; LEGAL_LEGAL-NEXT: ret void +; LEGAL_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { +; LEGAL_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: ret void + +; LEGAL_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { +; LEGAL_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r5 = call nnan ninf float 
@llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: ret void ; Drop %evl where possible else fold %evl into %mask (%evl Discard, %mask Legal) ; @@ -205,6 +331,30 @@ ; DISCARD_LEGAL-NOT: %{{.+}} = call @llvm.vp.{{.*}}, i32 %n) ; DISCARD_LEGAL: ret void +; DISCARD_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { +; DISCARD_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: ret void + +; DISCARD_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 
%n) { +; DISCARD_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NEXT: ret void ; Convert %evl into %mask everywhere (%evl Convert, %mask Legal) ; @@ -243,3 +393,35 @@ ; CONVERT_LEGAL-NOT: %{{.*}} = call @llvm.vp.{{.*}}, i32 %n) ; CONVERT_LEGAL: ret void +; CONVERT_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { +; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; CONVERT_LEGAL-NEXT: %{{.+}} = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> [[NEWM]], i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 
@llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; CONVERT_LEGAL: ret void + +; CONVERT_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { +; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; CONVERT_LEGAL-NEXT: %{{.+}} = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> [[NEWM]], i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf 
float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT: %{{.+}} = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT: %{{.+}} = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL: ret void
diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll
--- a/llvm/test/Verifier/vp-intrinsics.ll
+++ b/llvm/test/Verifier/vp-intrinsics.ll
@@ -29,6 +29,24 @@
 
 ; TODO: test_vp_constrained_fp
 
+
+define void @test_vp_reduction(i32 %x, <8 x i32> %vi, <8 x float> %vf, float %f, <8 x i1> %m, i32 %n) {
+  %r0 = call i32 @llvm.vp.reduce.add.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r1 = call i32 @llvm.vp.reduce.mul.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r2 = call i32 @llvm.vp.reduce.and.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r3 = call i32 @llvm.vp.reduce.or.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r4 = call i32 @llvm.vp.reduce.xor.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r5 = call i32 @llvm.vp.reduce.smax.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r6 = call i32 @llvm.vp.reduce.smin.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r7 = call i32 @llvm.vp.reduce.umax.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r8 = call i32 @llvm.vp.reduce.umin.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
+  %r9 = call float @llvm.vp.reduce.fmin.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rA = call float @llvm.vp.reduce.fmax.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rB = call float @llvm.vp.reduce.fadd.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rC = call float @llvm.vp.reduce.fmul.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  ret void
+}
+
 ; integer arith
 declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
 declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
@@ -50,3 +68,17 @@
 declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
 declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
 declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
+; reductions
+declare i32 @llvm.vp.reduce.add.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.mul.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.and.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.or.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.xor.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.smax.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.smin.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.umax.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare i32 @llvm.vp.reduce.umin.v8i32(i32, <8 x i32>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fmin.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fmax.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fmul.v8f32(float, <8 x float>, <8 x i1>, i32)
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -23,6 +23,11 @@
 
 namespace {
 
+static const char *ReductionIntOpcodes[] = {
+    "add", "mul", "and", "or", "xor", "smin", "smax", "umin", "umax"};
+
+static const char *ReductionFPOpcodes[] = {"fadd", "fmul", "fmin", "fmax"};
+
 class VPIntrinsicTest : public testing::Test {
 protected:
   LLVMContext Context;
@@ -46,10 +51,22 @@
       Str << " declare <8 x float> @llvm.vp." << BinaryFPOpcode
           << ".v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) ";
 
-    Str << " declare void @llvm.vp.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, <8 x i1>, i32) ";
-    Str << " declare void @llvm.vp.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, <8 x i1>, i32) ";
-    Str << " declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 x i1>, i32) ";
-    Str << " declare <8 x i32> @llvm.vp.gather.v8i32.v8p0i32(<8 x i32*>, <8 x i1>, i32) ";
+    Str << " declare void @llvm.vp.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, "
+           "<8 x i1>, i32) ";
+    Str << " declare void @llvm.vp.scatter.v8i32.v8p0i32(<8 x i32>, <8 x "
+           "i32*>, <8 x i1>, i32) ";
+    Str << " declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 "
+           "x i1>, i32) ";
+    Str << " declare <8 x i32> @llvm.vp.gather.v8i32.v8p0i32(<8 x i32*>, <8 x "
+           "i1>, i32) ";
+
+    for (const char *ReductionOpcode : ReductionIntOpcodes)
+      Str << " declare i32 @llvm.vp.reduce." << ReductionOpcode
+          << ".v8i32(i32, <8 x i32>, <8 x i1>, i32) ";
+
+    for (const char *ReductionOpcode : ReductionFPOpcodes)
+      Str << " declare float @llvm.vp.reduce." << ReductionOpcode
+          << ".v8f32(float, <8 x float>, <8 x i1>, i32) ";
 
     return parseAssemblyString(Str.str(), Err, C);
   }
@@ -283,3 +300,81 @@
 }
 
 } // end anonymous namespace
+
+/// Check various properties of VPReductionIntrinsics.
+///
+/// Assembles a function containing one non-reduction VP call (vp.mul)
+/// followed by a call to every integer and floating-point vp.reduce.*
+/// intrinsic, then checks for each VP call in the entry block that the
+/// static and per-instance start/vector parameter queries behave as expected.
+TEST_F(VPIntrinsicTest, VPReductions) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::stringstream Str;
+  // Declarations for vp.mul and for every vp.reduce.* variant under test.
+  Str << "declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, "
+         "i32)";
+  for (const char *ReductionOpcode : ReductionIntOpcodes)
+    Str << " declare i32 @llvm.vp.reduce." << ReductionOpcode
+        << ".v8i32(i32, <8 x i32>, <8 x i1>, i32) ";
+
+  for (const char *ReductionOpcode : ReductionFPOpcodes)
+    Str << " declare float @llvm.vp.reduce." << ReductionOpcode
+        << ".v8f32(float, <8 x float>, <8 x i1>, i32) ";
+
+  Str << "define void @test_reductions(i32 %start, <8 x i32> %val, float "
+         "%fpstart, <8 x float> %fpval, <8 x i1> %m, i32 %vl) {";
+
+  // Mix in a regular non-reduction intrinsic to check that the
+  // VPReductionIntrinsic subclass works as intended.
+  Str << " %r0 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %val, <8 x i32> "
+         "%val, <8 x i1> %m, i32 %vl)";
+
+  // %r0 is taken by the vp.mul call above, so reduction results start at %r1.
+  unsigned Idx = 1;
+  for (const char *ReductionOpcode : ReductionIntOpcodes)
+    Str << " %r" << Idx++ << " = call i32 @llvm.vp.reduce." << ReductionOpcode
+        << ".v8i32(i32 %start, <8 x i32> %val, <8 x i1> %m, i32 %vl)";
+  for (const char *ReductionOpcode : ReductionFPOpcodes)
+    Str << " %r" << Idx++ << " = call float @llvm.vp.reduce." << ReductionOpcode
+        << ".v8f32(float %fpstart, <8 x float> %fpval, <8 x i1> %m, i32 %vl)";
+
+  Str << " ret void"
+         "}";
+
+  std::unique_ptr<Module> M = parseAssemblyString(Str.str(), Err, C);
+  assert(M);
+
+  auto *F = M->getFunction("test_reductions");
+  assert(F);
+
+  for (const auto &I : F->getEntryBlock()) {
+    const VPIntrinsic *VPI = dyn_cast<VPIntrinsic>(&I);
+    // Instructions that are not VP intrinsics (e.g. the final ret) are skipped.
+    if (!VPI)
+      continue;
+
+    Intrinsic::ID ID = VPI->getIntrinsicID();
+    const auto *VPRedI = dyn_cast<VPReductionIntrinsic>(&I);
+
+    // A non-reduction VP intrinsic must not cast to VPReductionIntrinsic and
+    // must report no start or vector parameter position.
+    if (!VPReductionIntrinsic::isVPReduction(ID)) {
+      EXPECT_EQ(VPRedI, nullptr);
+      EXPECT_EQ(VPReductionIntrinsic::getStartParamPos(ID).hasValue(), false);
+      EXPECT_EQ(VPReductionIntrinsic::getVectorParamPos(ID).hasValue(), false);
+      continue;
+    }
+
+    // Reductions must cast, the static and member queries must agree, and the
+    // start value and vector operand must sit at positions 0 and 1.
+    EXPECT_EQ(VPReductionIntrinsic::getStartParamPos(ID).hasValue(), true);
+    EXPECT_EQ(VPReductionIntrinsic::getVectorParamPos(ID).hasValue(), true);
+    ASSERT_NE(VPRedI, nullptr);
+    EXPECT_EQ(VPReductionIntrinsic::getStartParamPos(ID), VPRedI->getStartParamPos());
+    EXPECT_EQ(VPReductionIntrinsic::getVectorParamPos(ID), VPRedI->getVectorParamPos());
+    EXPECT_EQ(VPRedI->getStartParamPos(), 0u);
+    EXPECT_EQ(VPRedI->getVectorParamPos(), 1u);
+  }
+}