diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -17212,6 +17212,252 @@ %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef +.. _int_vp_fadd: + +'``llvm.vp.fadd.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x float> @llvm.vp.fadd.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x float> @llvm.vp.fadd.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x double> @llvm.vp.fadd.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point addition of two vectors of floating-point values. + + +Arguments: +"""""""""" + +The first two operands and the result have the same floating-point vector type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.fadd``' intrinsic performs floating-point addition (:ref:`fadd <i_fadd>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. The operation is performed in the default +floating-point environment. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = fadd <4 x float> %a, %b + %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef + + +.. _int_vp_fsub: + +'``llvm.vp.fsub.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <16 x float> @llvm.vp.fsub.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x float> @llvm.vp.fsub.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x double> @llvm.vp.fsub.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point subtraction of two vectors of floating-point values. + + +Arguments: +"""""""""" + +The first two operands and the result have the same floating-point vector type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.fsub``' intrinsic performs floating-point subtraction (:ref:`fsub <i_fsub>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. The operation is performed in the default +floating-point environment. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = fsub <4 x float> %a, %b + %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef + + +.. _int_vp_fmul: + +'``llvm.vp.fmul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x float> @llvm.vp.fmul.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x float> @llvm.vp.fmul.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x double> @llvm.vp.fmul.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point multiplication of two vectors of floating-point values. + + +Arguments: +"""""""""" + +The first two operands and the result have the same floating-point vector type. The +third operand is the vector mask and has the same number of elements as the +result vector type. 
The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.fmul``' intrinsic performs floating-point multiplication (:ref:`fmul <i_fmul>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. The operation is performed in the default +floating-point environment. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = fmul <4 x float> %a, %b + %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef + + +.. _int_vp_fdiv: + +'``llvm.vp.fdiv.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x float> @llvm.vp.fdiv.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x float> @llvm.vp.fdiv.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x double> @llvm.vp.fdiv.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point division of two vectors of floating-point values. + + +Arguments: +"""""""""" + +The first two operands and the result have the same floating-point vector type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.fdiv``' intrinsic performs floating-point division (:ref:`fdiv <i_fdiv>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. The operation is performed in the default +floating-point environment. + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = fdiv <4 x float> %a, %b + %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef + + +.. _int_vp_frem: + +'``llvm.vp.frem.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x float> @llvm.vp.frem.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x float> @llvm.vp.frem.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x double> @llvm.vp.frem.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated floating-point remainder of two vectors of floating-point values. + + +Arguments: +"""""""""" + +The first two operands and the result have the same floating-point vector type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.frem``' intrinsic performs floating-point remainder (:ref:`frem <i_frem>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. The operation is performed in the default +floating-point environment. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = frem <4 x float> %a, %b + %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef + + + .. 
_int_get_active_lane_mask: '``llvm.get.active.lane.mask.*``' Intrinsics diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -253,6 +253,17 @@ // Whether \p ID is a VP intrinsic ID. static bool IsVPIntrinsic(Intrinsic::ID); + /// Constrained FP { + + // Whether the VP intrinsic \p ID can have a rounding mode bundle. + static bool HasRoundingMode(Intrinsic::ID); + + // Whether the VP intrinsic \p ID can have an exception behavior + // bundle. + static bool HasExceptionBehavior(Intrinsic::ID); + + /// } Constrained FP + /// \return the mask parameter or nullptr. Value *getMaskParam() const; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1382,6 +1382,36 @@ llvm_i32_ty]>; } +// Floating-point arithmetic. +let IntrProperties = + [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fsub : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_frem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; +} + def int_get_active_lane_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyint_ty, 
LLVMMatchType<1>], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -88,6 +88,17 @@ #define HANDLE_VP_TO_OPC(OPC) #endif +// Whether the intrinsic may have a rounding mode or exception behavior operand +// bundle. +// \p HASROUND '1' if the intrinsic can have a rounding mode operand bundle, +// '0' otherwise. +// \p HASEXCEPT '1' if the intrinsic can have an exception behavior operand +// bundle, '0' otherwise. +// \p INTRINID The constrained fp intrinsic this VP intrinsic corresponds to. +#ifndef HANDLE_VP_TO_CONSTRAINEDFP +#define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) +#endif + /// } Property Macros ///// Integer Arithmetic { @@ -146,6 +157,36 @@ ///// } Integer Arithmetic +///// Floating-Point Arithmetic { + +// Specialized helper macro for floating-point binary operators (%x, %y, %mask, %evl). +#ifdef HELPER_REGISTER_BINARY_FP_VP +#error "The internal helper macro HELPER_REGISTER_BINARY_FP_VP is already defined!" 
+#endif +#define HELPER_REGISTER_BINARY_FP_VP(OPSUFFIX, SDOPC, OPC) \ +BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, SDOPC, -1) \ +HANDLE_VP_TO_OPC(OPC) \ +HANDLE_VP_TO_CONSTRAINEDFP(1,1,llvm_experimental_constrained_##OPSUFFIX) \ +END_REGISTER_VP(vp_##OPSUFFIX, SDOPC) + +// llvm.vp.fadd(x,y,mask,vlen) +HELPER_REGISTER_BINARY_FP_VP(fadd, VP_FADD, FAdd) + +// llvm.vp.fsub(x,y,mask,vlen) +HELPER_REGISTER_BINARY_FP_VP(fsub, VP_FSUB, FSub) + +// llvm.vp.fmul(x,y,mask,vlen) +HELPER_REGISTER_BINARY_FP_VP(fmul, VP_FMUL, FMul) + +// llvm.vp.fdiv(x,y,mask,vlen) +HELPER_REGISTER_BINARY_FP_VP(fdiv, VP_FDIV, FDiv) + +// llvm.vp.frem(x,y,mask,vlen) +HELPER_REGISTER_BINARY_FP_VP(frem, VP_FREM, FRem) + +#undef HELPER_REGISTER_BINARY_FP_VP + +///// } Floating-Point Arithmetic #undef BEGIN_REGISTER_VP #undef BEGIN_REGISTER_VP_INTRINSIC @@ -154,3 +195,4 @@ #undef END_REGISTER_VP_INTRINSIC #undef END_REGISTER_VP_SDNODE #undef HANDLE_VP_TO_OPC +#undef HANDLE_VP_TO_CONSTRAINEDFP diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -240,6 +240,33 @@ return true; } +bool VPIntrinsic::HasRoundingMode(Intrinsic::ID ID) { + switch (ID) { + default: + return false; + +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: +#define END_REGISTER_VP_INTRINSIC(VPID) return false; +#define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, ...) return (bool)HASROUND; +#include "llvm/IR/VPIntrinsics.def" + } +} + +bool VPIntrinsic::HasExceptionBehavior(Intrinsic::ID ID) { + switch (ID) { + default: + return false; + +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: +#define END_REGISTER_VP_INTRINSIC(VPID) return false; +#define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, ...) 
\ + return (bool)HASEXCEPT; +#include "llvm/IR/VPIntrinsics.def" + } +} + // Equivalent non-predicated opcode unsigned VPIntrinsic::GetFunctionalOpcodeForVP(Intrinsic::ID ID) { unsigned FunctionalOC = Instruction::Call; diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll --- a/llvm/test/Verifier/vp-intrinsics.ll +++ b/llvm/test/Verifier/vp-intrinsics.ll @@ -17,6 +17,18 @@ ret void } + +define void @test_vp_fp(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) { + %r0 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) + %r1 = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) + %r2 = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) + %r3 = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) + %r4 = call <8 x double> @llvm.vp.frem.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) + ret void +} + +; TODO: test_vp_constrained_fp + ; integer arith declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) @@ -32,3 +44,10 @@ declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; fp arith +declare <8 x double> @llvm.vp.fadd.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +declare <8 x double> @llvm.vp.fsub.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) +