diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -17856,6 +17856,67 @@ %also.r = select <4 x i1> %cond, <4 x i32> %on_true, <4 x i32> %on_false +.. _int_vp_merge: + +'``llvm.vp.merge.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.merge.v16i32 (<16 x i1> <condition>, <16 x i32> <on_true>, <16 x i32> <on_false>, i32 <pivot>) + declare <vscale x 4 x i64> @llvm.vp.merge.nxv4i64 (<vscale x 4 x i1> <condition>, <vscale x 4 x i64> <on_true>, <vscale x 4 x i64> <on_false>, i32 <pivot>) + +Overview: +""""""""" + +The '``llvm.vp.merge``' intrinsic is used to choose one value based on a +condition vector and an index operand, without IR-level branching. + +Arguments: +"""""""""" + +The first operand is a vector of ``i1`` and indicates the condition. The +second operand is the value that is merged where the condition vector is true. +The third operand is the value that is selected where the condition vector is +false or the lane position is greater than or equal to the pivot. The fourth operand +is the pivot. + +#. The optional ``fast-math flags`` marker indicates that the merge has one or + more :ref:`fast-math flags <fastmath>`. These are optimization hints to + enable otherwise unsafe floating-point optimizations. Fast-math flags are + only valid for merges that return a floating-point scalar or vector type, + or an array (nested to any depth) of floating-point scalar or vector types. + +Semantics: +"""""""""" + +The intrinsic selects lanes from the second and third operand depending on a +condition vector and pivot value. + +For all lanes where the condition vector is true and the lane position is less +than ``%pivot`` the lane is taken from the second operand. Otherwise, the lane +is taken from the third operand. + +Example: +"""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> %cond, <4 x i32> %on_true, <4 x i32> %on_false, i32 %pivot) + + ;;; Expansion. 
+ ;; Lanes at and above %pivot are taken from %on_false + %atfirst = insertelement <4 x i32> undef, i32 %pivot, i32 0 + %splat = shufflevector <4 x i32> %atfirst, <4 x i32> poison, <4 x i32> zeroinitializer + %pivotmask = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> %splat + %mergemask = and <4 x i1> %cond, <4 x i1> %pivotmask + %also.r = select <4 x i1> %mergemask, <4 x i32> %on_true, <4 x i32> %on_false + + .. _int_vp_add: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1507,6 +1507,12 @@ LLVMMatchType<0>, llvm_i32_ty]>; +def int_vp_merge : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_i32_ty]>; + // Reductions let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -349,6 +349,10 @@ VP_PROPERTY_FUNCTIONAL_OPC(Select) END_REGISTER_VP(vp_select, VP_SELECT) +// llvm.vp.merge(mask,on_true,on_false,pivot) +BEGIN_REGISTER_VP(vp_merge, 0, 3, VP_MERGE, -1) +END_REGISTER_VP(vp_merge, VP_MERGE) + BEGIN_REGISTER_VP(experimental_vp_splice, 3, 5, EXPERIMENTAL_VP_SPLICE, -1) END_REGISTER_VP(experimental_vp_splice, EXPERIMENTAL_VP_SPLICE) diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -482,6 +482,7 @@ VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); break; } + case Intrinsic::vp_merge: case Intrinsic::vp_select: VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()}); break; diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp --- 
a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -68,6 +68,8 @@ Str << " declare float @llvm.vp.reduce." << ReductionOpcode << ".v8f32(float, <8 x float>, <8 x i1>, i32) "; + Str << " declare <8 x i32> @llvm.vp.merge.v8i32(<8 x i1>, <8 x i32>, <8 x " + "i32>, i32)"; Str << " declare <8 x i32> @llvm.vp.select.v8i32(<8 x i1>, <8 x i32>, <8 x " "i32>, i32)"; Str << " declare <8 x i32> @llvm.experimental.vp.splice.v8i32(<8 x "