diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -7538,6 +7538,8 @@
 
       <result> = fadd float 4.0, %var          ; yields float:result = 4.0 + %var
 
+.. _i_sub:
+
 '``sub``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -7633,6 +7635,8 @@
       <result> = fsub float 4.0, %var          ; yields float:result = 4.0 - %var
       <result> = fsub float -0.0, %val         ; yields float:result = -%var
 
+.. _i_mul:
+
 '``mul``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -7727,6 +7731,8 @@
 
       <result> = fmul float 4.0, %var          ; yields float:result = 4.0 * %var
 
+.. _i_udiv:
+
 '``udiv``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -7773,6 +7779,8 @@
 
       <result> = udiv i32 4, %var          ; yields i32:result = 4 / %var
 
+.. _i_sdiv:
+
 '``sdiv``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -7861,6 +7869,8 @@
 
       <result> = fdiv float 4.0, %var          ; yields float:result = 4.0 / %var
 
+.. _i_urem:
+
 '``urem``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -7905,6 +7915,8 @@
 
       <result> = urem i32 4, %var          ; yields i32:result = 4 % %var
 
+.. _i_srem:
+
 '``srem``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -8018,6 +8030,8 @@
 operands of the same type, execute an operation on them, and produce a
 single value. The resulting value is the same type as its operands.
 
+.. _i_shl:
+
 '``shl``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -8070,6 +8084,9 @@
       <result> = shl i32 1, 32  ; undefined
       <result> = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2>   ; yields: result=<2 x i32> < i32 2, i32 4>
 
+.. _i_lshr:
+
+
 '``lshr``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -8119,6 +8136,8 @@
       <result> = lshr i32 1, 32  ; undefined
       <result> = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2>   ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1>
 
+.. _i_ashr:
+
 '``ashr``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -8169,6 +8188,8 @@
       <result> = ashr i32 1, 32  ; undefined
       <result> = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3>  ; yields: result=<2 x i32> < i32 -1, i32 0>
 
+.. _i_and:
+
 '``and``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -8218,6 +8239,8 @@
       <result> = and i32 15, 40          ; yields i32:result = 8
       <result> = and i32 4, 8            ; yields i32:result = 0
 
+.. _i_or:
+
 '``or``' Instruction
 ^^^^^^^^^^^^^^^^^^^^
 
@@ -8267,6 +8290,8 @@
       <result> = or i32 15, 40           ; yields i32:result = 47
       <result> = or i32 4, 8             ; yields i32:result = 12
 
+.. _i_xor:
+
 '``xor``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -14626,6 +14651,586 @@
 
 after performing the required machine specific adjustments. The pointer
 returned can then be :ref:`bitcast and executed <int_trampoline>`.
+
+.. _int_vp:
+
+Vector Predication Intrinsics
+-----------------------------
+VP intrinsics are intended for predicated SIMD/vector code.
+A typical VP operation takes a vector mask and an explicit vector length parameter as in:
+
+::
+
+      llvm.vp.<opcode>.*(<W x T> %x, <W x T> %y, <W x i1> %mask, i32 %evl)
+
+The vector mask parameter is always a vector of `i1`, for example `<32 x i1>`.
+The explicit vector length parameter always has the type `i32`.
+The explicit vector length is only effective if the MSB of its value is zero.
+Results are only computed for enabled lanes.
+A lane is enabled if the mask at that position is true and, if the explicit
+vector length is effective, the lane position is below the explicit vector
+length.
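+
+The following sketch illustrates this rule for a 4-element '``llvm.vp.add``'
+call by folding the explicit vector length into the mask operand. This is a
+minimal, illustrative example only; the value names are not part of any
+intrinsic definition:
+
+.. code-block:: llvm
+
+      ;; Illustration only: %evl.mask enables the lane positions below %evl.
+      ;; If the MSB of %evl is set, the unsigned comparison enables all lanes,
+      ;; matching an ineffective explicit vector length.
+      %evl.head = insertelement <4 x i32> undef, i32 %evl, i32 0
+      %evl.splat = shufflevector <4 x i32> %evl.head, <4 x i32> undef, <4 x i32> zeroinitializer
+      %evl.mask = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %evl.splat
+      %folded.mask = and <4 x i1> %evl.mask, %mask
+      ;; %r computes results on the same set of enabled lanes as a call with
+      ;; the original %mask and %evl operands would.
+      %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %folded.mask, i32 4)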
+
+
+.. _int_vp_add:
+
+'``llvm.vp.add.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.add.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.add.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer addition of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.add``' intrinsic performs integer addition (:ref:`add <i_add>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = add <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_sub:
+
+'``llvm.vp.sub.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.sub.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.sub.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer subtraction of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.sub``' intrinsic performs integer subtraction (:ref:`sub <i_sub>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = sub <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_mul:
+
+'``llvm.vp.mul.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.mul.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.mul.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer multiplication of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.mul``' intrinsic performs integer multiplication (:ref:`mul <i_mul>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = mul <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_sdiv:
+
+'``llvm.vp.sdiv.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.sdiv.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.sdiv.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated, signed division of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.sdiv``' intrinsic performs signed division (:ref:`sdiv <i_sdiv>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = sdiv <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_udiv:
+
+'``llvm.vp.udiv.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.udiv.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.udiv.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated, unsigned division of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.udiv``' intrinsic performs unsigned division (:ref:`udiv <i_udiv>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = udiv <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_srem:
+
+'``llvm.vp.srem.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.srem.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.srem.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated computation of the signed remainder of two integer vectors.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.srem``' intrinsic computes the remainder of the signed division (:ref:`srem <i_srem>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = srem <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_urem:
+
+'``llvm.vp.urem.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.urem.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.urem.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated computation of the unsigned remainder of two integer vectors.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.urem``' intrinsic computes the remainder of the unsigned division (:ref:`urem <i_urem>`) of the first and second vector operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = urem <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_ashr:
+
+'``llvm.vp.ashr.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.ashr.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.ashr.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated arithmetic right-shift.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.ashr``' intrinsic computes the arithmetic right shift (:ref:`ashr <i_ashr>`) of the first operand by the second operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = ashr <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_lshr:
+
+'``llvm.vp.lshr.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.lshr.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.lshr.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated logical right-shift.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.lshr``' intrinsic computes the logical right shift (:ref:`lshr <i_lshr>`) of the first operand by the second operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = lshr <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_shl:
+
+'``llvm.vp.shl.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.shl.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.shl.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated left shift.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.shl``' intrinsic computes the left shift (:ref:`shl <i_shl>`) of the first operand by the second operand on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = shl <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_or:
+
+'``llvm.vp.or.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.or.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.or.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated or.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.or``' intrinsic performs a bitwise or (:ref:`or <i_or>`) of the first two operands on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = or <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_and:
+
+'``llvm.vp.and.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.and.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.and.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated and.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.and``' intrinsic performs a bitwise and (:ref:`and <i_and>`) of the first two operands on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = and <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_xor:
+
+'``llvm.vp.xor.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.xor.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64> @llvm.vp.xor.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated, bitwise xor.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.xor``' intrinsic performs a bitwise xor (:ref:`xor <i_xor>`) of the first two operands on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %avl)
+      ;; For all lanes below %avl, %r is lane-wise equivalent to %also.r
+
+      %t = xor <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
 .. _int_mload_mstore:
 
 Masked Vector Load and Store Intrinsics
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -205,6 +205,105 @@
     /// @}
   };
 
+  /// This is the common base class for vector predication intrinsics.
+  class VPIntrinsic : public IntrinsicInst {
+  public:
+    static Optional GetMaskParamPos(Intrinsic::ID IntrinsicID);
+    static Optional GetVectorLengthParamPos(Intrinsic::ID IntrinsicID);
+
+    // the llvm.vp.* intrinsic for an instruction Opcode.
+    static Intrinsic::ID GetForOpcode(unsigned OC);
+
+    /// set the mask parameter.
+    /// this asserts if the underlying intrinsic has no mask parameter.
+    void setMaskParam(Value *);
+
+    /// set the vector length parameter.
+    /// this asserts if the underlying intrinsic has no vector length
+    /// parameter.
+    void setVectorLengthParam(Value *);
+
+    /// \return the mask parameter or nullptr.
+    Value *getMaskParam() const;
+
+    /// \return the vector length parameter or nullptr.
+    Value *getVectorLengthParam() const;
+
+    /// \return whether the vector length param can be ignored.
+    bool canIgnoreVectorLengthParam() const;
+
+    /// \return the static element count (vector number of elements) the vector
+    /// length parameter applies to. This returns None if the operation is
+    /// scalable.
+ Optional getStaticVectorLength() const; + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + + // int arith + case Intrinsic::vp_and: + case Intrinsic::vp_or: + case Intrinsic::vp_xor: + case Intrinsic::vp_ashr: + case Intrinsic::vp_lshr: + case Intrinsic::vp_shl: + case Intrinsic::vp_add: + case Intrinsic::vp_sub: + case Intrinsic::vp_mul: + case Intrinsic::vp_udiv: + case Intrinsic::vp_sdiv: + case Intrinsic::vp_urem: + case Intrinsic::vp_srem: + return true; + } + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + // Equivalent non-predicated opcode + unsigned getFunctionalOpcode() const { + return GetFunctionalOpcodeForVP(getIntrinsicID()); + } + // Equivalent non-predicated opcode + static unsigned GetFunctionalOpcodeForVP(Intrinsic::ID ID) { + switch (ID) { + default: + return Instruction::Call; + + case Intrinsic::vp_and: + return Instruction::And; + case Intrinsic::vp_or: + return Instruction::Or; + case Intrinsic::vp_xor: + return Instruction::Xor; + case Intrinsic::vp_ashr: + return Instruction::AShr; + case Intrinsic::vp_lshr: + return Instruction::LShr; + case Intrinsic::vp_shl: + return Instruction::Shl; + case Intrinsic::vp_add: + return Instruction::Add; + case Intrinsic::vp_sub: + return Instruction::Sub; + case Intrinsic::vp_mul: + return Instruction::Mul; + case Intrinsic::vp_udiv: + return Instruction::UDiv; + case Intrinsic::vp_sdiv: + return Instruction::SDiv; + case Intrinsic::vp_urem: + return Instruction::URem; + case Intrinsic::vp_srem: + return Instruction::SRem; + } + } + }; + /// This is the common base class for constrained floating point intrinsics. class ConstrainedFPIntrinsic : public IntrinsicInst { public: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1099,6 +1099,78 @@ def int_ptrmask: Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty, llvm_anyint_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +//===---------------- Vector Predication Intrinsics --------------===// + +// Binary operators +let IntrProperties = [IntrNoMem, IntrWillReturn] in { + def int_vp_add : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sub : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_mul : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sdiv : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_udiv : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_srem : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_urem : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ashr : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + 
llvm_i32_ty]>; + def int_vp_lshr : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_shl : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_or : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_and : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_xor : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + +} + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -102,6 +102,136 @@ return ConstantInt::get(Type::getInt64Ty(Context), 1); } +Optional VPIntrinsic::getStaticVectorLength() const { + auto GetStaticVectorLengthOfType = [](const Type *T) -> Optional { + auto VT = dyn_cast(T); + if (!VT || VT->isScalable()) + return None; + + // Corner case for excessive number of elements in the vector type + auto Num = VT->getNumElements(); + if (Num >= + static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } + + return static_cast(Num); + }; + + auto VPMask = getMaskParam(); + return GetStaticVectorLengthOfType(VPMask->getType()); +} + +void VPIntrinsic::setMaskParam(Value *NewMask) { + auto MaskPos = GetMaskParamPos(getIntrinsicID()); + assert(MaskPos.hasValue()); + this->setOperand(MaskPos.getValue(), NewMask); +} + +void VPIntrinsic::setVectorLengthParam(Value *NewVL) { + auto VLPos = GetVectorLengthParamPos(getIntrinsicID()); + assert(VLPos.hasValue()); + this->setOperand(VLPos.getValue(), NewVL); +} + +Value *VPIntrinsic::getMaskParam() const { + auto maskPos = GetMaskParamPos(getIntrinsicID()); + if (maskPos) + return getArgOperand(maskPos.getValue()); + return nullptr; +} + +Value *VPIntrinsic::getVectorLengthParam() const { + auto vlenPos = GetVectorLengthParamPos(getIntrinsicID()); + if (vlenPos) + return getArgOperand(vlenPos.getValue()); + return nullptr; +} + +Optional VPIntrinsic::GetMaskParamPos(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + default: + return None; + + // int arith + case Intrinsic::vp_and: + case Intrinsic::vp_or: + case Intrinsic::vp_xor: + case Intrinsic::vp_ashr: + case Intrinsic::vp_lshr: + case Intrinsic::vp_shl: + case Intrinsic::vp_add: + case Intrinsic::vp_sub: + case Intrinsic::vp_mul: + case Intrinsic::vp_udiv: + case Intrinsic::vp_sdiv: + case Intrinsic::vp_urem: + case Intrinsic::vp_srem: + return 2; + } +} + +Optional VPIntrinsic::GetVectorLengthParamPos(Intrinsic::ID IntrinsicID) { + auto maskPos = GetMaskParamPos(IntrinsicID); + if (maskPos) { + return maskPos.getValue() + 1; + } + return None; +} + +Intrinsic::ID VPIntrinsic::GetForOpcode(unsigned OC) { + switch (OC) { + default: + return Intrinsic::not_intrinsic; + + case Instruction::Add: + return Intrinsic::vp_add; + case Instruction::Sub: + return Intrinsic::vp_sub; + case Instruction::Mul: + return Intrinsic::vp_mul; + case Instruction::SDiv: + return Intrinsic::vp_sdiv; + case Instruction::UDiv: + return Intrinsic::vp_udiv; + case 
Instruction::SRem: + return Intrinsic::vp_srem; + case Instruction::URem: + return Intrinsic::vp_urem; + case Instruction::Or: + return Intrinsic::vp_or; + case Instruction::And: + return Intrinsic::vp_and; + case Instruction::Xor: + return Intrinsic::vp_xor; + case Instruction::LShr: + return Intrinsic::vp_lshr; + case Instruction::AShr: + return Intrinsic::vp_ashr; + case Instruction::Shl: + return Intrinsic::vp_shl; + } +} + +bool VPIntrinsic::canIgnoreVectorLengthParam() const { + auto StaticVL = getStaticVectorLength(); + if (!StaticVL.hasValue()) + return false; + + auto *VLParam = getVectorLengthParam(); + assert(VLParam); + + // Check whether the vector length param is an out-of-range constant. + auto VLConst = dyn_cast(VLParam); + if (!VLConst) + return false; + int64_t VLNum = VLConst->getSExtValue(); + if (VLNum < 0 || VLNum >= StaticVL.getValue()) + return true; + + return false; +} + Optional ConstrainedFPIntrinsic::getRoundingMode() const { unsigned NumOperands = getNumArgOperands(); diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Verifier/vp-intrinsics.ll @@ -0,0 +1,34 @@ +; RUN: opt --verify %s + +define void @test_vp_int(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) { + %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + ret void +} + +; integer arith +declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; bit arith +declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) diff --git 
a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt --- a/llvm/unittests/IR/CMakeLists.txt +++ b/llvm/unittests/IR/CMakeLists.txt @@ -39,6 +39,7 @@ ValueTest.cpp VectorTypesTest.cpp VerifierTest.cpp + VPIntrinsicTest.cpp WaymarkTest.cpp ) diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp new file mode 100644 --- /dev/null +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -0,0 +1,143 @@ +//===- VPIntrinsicTest.cpp - VPIntrinsic unit tests ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace { + +// We use this fixture to ensure that we clean up DivergenceAnalysis before +// deleting the PassManager. +class VPIntrinsicTest : public testing::Test { +protected: + LLVMContext Context; + + VPIntrinsicTest() : Context() {} + + LLVMContext C; + SMDiagnostic Err; + + std::unique_ptr CreateVPDeclarationModule() { + return parseAssemblyString( +" declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ", + Err, C); + } +}; + +/// Check that VPIntrinsic:canIgnoreVectorLengthParam() returns true +/// if the vector length parameter does not mask-off any lanes. 
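+/// For the fixed <256 x i64> calls below, this is only the case for constant
+/// vector lengths that are negative (MSB set) or at least 256. A non-constant
+/// %vl and the in-range constants 0 and 123 may still mask off lanes, so the
+/// expected results are {false, true, false, true, false}.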
+TEST_F(VPIntrinsicTest, CanIgnoreVectorLength) { + LLVMContext C; + SMDiagnostic Err; + + std::unique_ptr M = + parseAssemblyString( +"declare <256 x i64> @llvm.vp.mul.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)" +"define void @test_static_vlen(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %vl) { " +" %r0 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %vl)" +" %r1 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 256)" +" %r2 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 0)" +" %r3 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 -1)" +" %r4 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 123)" +" ret void " +"}", + Err, C); + + auto *F = M->getFunction("test_static_vlen"); + assert(F); + + const int NumExpected = 5; + const bool Expected[] = {false, true, false, true, false}; + int i = 0; + for (auto &I : F->getEntryBlock()) { + VPIntrinsic *VPI = dyn_cast(&I); + if (!VPI) { + ASSERT_TRUE(I.isTerminator()); + continue; + } + + ASSERT_LT(i, NumExpected); + ASSERT_EQ(Expected[i], VPI->canIgnoreVectorLengthParam()); + ++i; + } +} + +/// Check that the argument returned by +/// VPIntrinsic::GetParamPos(Intrinsic::ID) has the expected type. +TEST_F(VPIntrinsicTest, GetParamPos) { + std::unique_ptr M = CreateVPDeclarationModule(); + assert(M); + + for (Function &F : *M) { + ASSERT_TRUE(F.isIntrinsic()); + Optional MaskParamPos = VPIntrinsic::GetMaskParamPos(F.getIntrinsicID()); + if (MaskParamPos.hasValue()) { + Type *MaskParamType = F.getArg(MaskParamPos.getValue())->getType(); + ASSERT_TRUE(MaskParamType->isVectorTy()); + ASSERT_TRUE(MaskParamType->getVectorElementType()->isIntegerTy(1)); + } + + Optional VecLenParamPos = VPIntrinsic::GetVectorLengthParamPos(F.getIntrinsicID()); + if (VecLenParamPos.hasValue()) { + Type *VecLenParamType = F.getArg(VecLenParamPos.getValue())->getType(); + ASSERT_TRUE(VecLenParamType->isIntegerTy(32)); + } + } +} + +/// Check that going from Opcode to VP intrinsic and back results in the same Opcode. +TEST_F(VPIntrinsicTest, OpcodeRoundTrip) { + std::vector Opcodes; + Opcodes.reserve(100); + + { +#define HANDLE_INST(OCNum, OCName, Class) Opcodes.push_back(OCNum); +#include "llvm/IR/Instruction.def" + } + + unsigned FullTripCounts = 0; + for (unsigned OC : Opcodes) { + Intrinsic::ID VPID = VPIntrinsic::GetForOpcode(OC); + // no equivalent VP intrinsic available + if (VPID == Intrinsic::not_intrinsic) + continue; + + unsigned RoundTripOC = VPIntrinsic::GetFunctionalOpcodeForVP(VPID); + // no equivalent Opcode available + if (RoundTripOC == Instruction::Call) + continue; + + ASSERT_EQ(RoundTripOC, OC); + ++FullTripCounts; + } + ASSERT_NE(FullTripCounts, 0u); +} + +} // end anonymous namespace +} // end namespace llvm diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -728,8 +728,7 @@ // variants with iAny types; otherwise, if the intrinsic is not // overloaded, all the types can be specified directly. assert(((!TyEl->isSubClassOf("LLVMExtendedType") && - !TyEl->isSubClassOf("LLVMTruncatedType") && - !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) || + !TyEl->isSubClassOf("LLVMTruncatedType")) || VT == MVT::iAny || VT == MVT::vAny) && "Expected iAny or vAny type"); } else