diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15982,6 +15982,81 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. +'``llvm.experimental.vector.insert``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. You can use ``llvm.experimental.vector.insert`` +to insert a fixed-width vector into a scalable vector, but not the other way +around. + +:: + + declare <vscale x 4 x float> @llvm.experimental.vector.insert.v4f32(<vscale x 4 x float> %vec, <4 x float> %subvec, i64 %idx) + declare <vscale x 2 x double> @llvm.experimental.vector.insert.v2f64(<vscale x 2 x double> %vec, <2 x double> %subvec, i64 %idx) + +Overview: +""""""""" + +The '``llvm.experimental.vector.insert.*``' intrinsics insert a vector into another vector +starting from a given index. The return type matches the type of the vector we +insert into. Conceptually, this can be used to build a scalable vector out of +non-scalable vectors. + +Arguments: +"""""""""" + +The ``vec`` is the vector which ``subvec`` will be inserted into. +The ``subvec`` is the vector that will be inserted. + +``idx`` represents the starting element number at which ``subvec`` will be +inserted. ``idx`` must be a constant multiple of ``subvec``'s known minimum +vector length. If ``subvec`` is a scalable vector, ``idx`` is first scaled by +the runtime scaling factor of ``subvec``. The elements of ``vec`` starting at +``idx`` are overwritten with ``subvec``. Elements ``idx`` through (``idx`` + +num_elements(``subvec``) - 1) must be valid ``vec`` indices. If this condition +cannot be determined statically but is false at runtime, then the result vector +is undefined. + + +'``llvm.experimental.vector.extract``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
You can use +``llvm.experimental.vector.extract`` to extract a fixed-width vector from a +scalable vector, but not the other way around. + +:: + + declare <4 x float> @llvm.experimental.vector.extract.v4f32(<vscale x 4 x float> %vec, i64 %idx) + declare <2 x double> @llvm.experimental.vector.extract.v2f64(<vscale x 2 x double> %vec, i64 %idx) + +Overview: +""""""""" + +The '``llvm.experimental.vector.extract.*``' intrinsics extract a vector from +within another vector starting from a given index. The return type must be +explicitly specified. Conceptually, this can be used to decompose a scalable +vector into non-scalable parts. + +Arguments: +"""""""""" + +The ``vec`` is the vector from which we will extract a subvector. + +The ``idx`` specifies the starting element number within ``vec`` from which a +subvector is extracted. ``idx`` must be a constant multiple of the known-minimum +vector length of the result type. If the result type is a scalable vector, +``idx`` is first scaled by the result type's runtime scaling factor. Elements +``idx`` through (``idx`` + num_elements(result_type) - 1) must be valid vector +indices. If this condition cannot be determined statically but is false at +runtime, then the result vector is undefined. The ``idx`` parameter must be a +vector index constant type (for most targets this will be an integer pointer +type). 
+ Matrix Intrinsics ----------------- diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1625,6 +1625,15 @@ //===---------- Intrinsics to query properties of scalable vectors --------===// def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; +//===---------- Intrinsics to perform subvector insertion/extraction ------===// +def int_experimental_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_i64_ty], + [IntrNoMem, ImmArg>]>; + +def int_experimental_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_i64_ty], + [IntrNoMem, ImmArg>]>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6892,6 +6892,27 @@ SetCC)); return; } + case Intrinsic::experimental_vector_insert: { + auto DL = getCurSDLoc(); + + SDValue Vec = getValue(I.getOperand(0)); + SDValue SubVec = getValue(I.getOperand(1)); + SDValue Index = getValue(I.getOperand(2)); + EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResultVT, Vec, SubVec, + Index)); + return; + } + case Intrinsic::experimental_vector_extract: { + auto DL = getCurSDLoc(); + + SDValue Vec = getValue(I.getOperand(0)); + SDValue Index = getValue(I.getOperand(1)); + EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index)); + return; + } } } diff --git a/llvm/lib/IR/Verifier.cpp 
b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5146,6 +5146,26 @@ break; } + case Intrinsic::experimental_vector_insert: { + VectorType *VecTy = cast(Call.getArgOperand(0)->getType()); + VectorType *SubVecTy = cast(Call.getArgOperand(1)->getType()); + + Assert(VecTy->getElementType() == SubVecTy->getElementType(), + "experimental_vector_insert parameters must have the same element " + "type.", + &Call); + break; + } + case Intrinsic::experimental_vector_extract: { + VectorType *ResultTy = cast(Call.getType()); + VectorType *VecTy = cast(Call.getArgOperand(0)->getType()); + + Assert(ResultTy->getElementType() == VecTy->getElementType(), + "experimental_vector_extract result must have the same element " + "type as the input vector.", + &Call); + break; + } }; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1652,6 +1652,102 @@ } break; } + case Intrinsic::experimental_vector_insert: { + Value *Vec = II->getArgOperand(0); + Value *SubVec = II->getArgOperand(1); + Value *Idx = II->getArgOperand(2); + auto *DstTy = dyn_cast(II->getType()); + auto *VecTy = dyn_cast(Vec->getType()); + auto *SubVecTy = dyn_cast(SubVec->getType()); + + // Only canonicalize if the destination vector, Vec, and SubVec are all + // fixed vectors. + if (DstTy && VecTy && SubVecTy) { + unsigned DstNumElts = DstTy->getNumElements(); + unsigned VecNumElts = VecTy->getNumElements(); + unsigned SubVecNumElts = SubVecTy->getNumElements(); + unsigned IdxN = cast(Idx)->getZExtValue(); + + // The result of this call is undefined if IdxN is not a constant multiple + // of the SubVec's minimum vector length OR the insertion overruns Vec. 
+ if (IdxN % SubVecNumElts != 0 || IdxN + SubVecNumElts > VecNumElts) { + replaceInstUsesWith(CI, UndefValue::get(CI.getType())); + return eraseInstFromFunction(CI); + } + + // An insert that entirely overwrites Vec with SubVec is a nop. + if (VecNumElts == SubVecNumElts) { + replaceInstUsesWith(CI, SubVec); + return eraseInstFromFunction(CI); + } + + // Widen SubVec into a vector of the same width as Vec, since + // shufflevector requires the two input vectors to be the same width. + // Elements beyond the bounds of SubVec within the widened vector are + // undefined. + SmallVector WidenMask; + unsigned i; + for (i = 0; i != SubVecNumElts; ++i) + WidenMask.push_back(i); + for (; i != VecNumElts; ++i) + WidenMask.push_back(UndefMaskElem); + + Value *WidenShuffle = Builder.CreateShuffleVector( + SubVec, llvm::UndefValue::get(SubVecTy), WidenMask); + + SmallVector Mask; + for (unsigned i = 0; i != IdxN; ++i) + Mask.push_back(i); + for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i) + Mask.push_back(i); + for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i) + Mask.push_back(i); + + Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); + replaceInstUsesWith(CI, Shuffle); + return eraseInstFromFunction(CI); + } + break; + } + case Intrinsic::experimental_vector_extract: { + Value *Vec = II->getArgOperand(0); + Value *Idx = II->getArgOperand(1); + + auto *DstTy = dyn_cast(II->getType()); + auto *VecTy = dyn_cast(Vec->getType()); + + // Only canonicalize if the destination vector and Vec are fixed + // vectors. + if (DstTy && VecTy) { + unsigned DstNumElts = DstTy->getNumElements(); + unsigned VecNumElts = VecTy->getNumElements(); + unsigned IdxN = cast(Idx)->getZExtValue(); + + // The result of this call is undefined if IdxN is not a constant multiple + // of the result type's minimum vector length OR the extraction overruns + // Vec. 
+ if (IdxN % DstNumElts != 0 || IdxN + DstNumElts > VecNumElts) { + replaceInstUsesWith(CI, UndefValue::get(CI.getType())); + return eraseInstFromFunction(CI); + } + + // Extracting the entirety of Vec is a nop. + if (VecNumElts == DstNumElts) { + replaceInstUsesWith(CI, Vec); + return eraseInstFromFunction(CI); + } + + SmallVector Mask; + for (unsigned i = 0; i != DstNumElts; ++i) + Mask.push_back(IdxN + i); + + Value *Shuffle = + Builder.CreateShuffleVector(Vec, UndefValue::get(VecTy), Mask); + replaceInstUsesWith(CI, Shuffle); + return eraseInstFromFunction(CI); + } + break; + } default: { // Handle target specific intrinsics Optional V = targetInstCombineIntrinsic(*II); diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; Should codegen to a nop, since idx is zero. +define <2 x i64> @extract_v2i64_nxv2i64( %vec) nounwind { +; CHECK-LABEL: extract_v2i64_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64( %vec, i64 0) + ret <2 x i64> %retval +} + +; Goes through memory currently; idx != 0. +define <2 x i64> @extract_v2i64_nxv2i64_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v2i64_nxv2i64_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +%retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64( %vec, i64 1) +ret <2 x i64> %retval +} + +; Should codegen to a nop, since idx is zero. +define <4 x i32> @extract_v4i32_nxv4i32( %vec) nounwind { +; CHECK-LABEL: extract_v4i32_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +%retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 0) +ret <4 x i32> %retval +} + +; Goes through memory currently; idx != 0. +define <4 x i32> @extract_v4i32_nxv4i32_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v4i32_nxv4i32_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 1) + ret <4 x i32> %retval +} + +; Should codegen to a nop, since idx is zero. 
+define <8 x i16> @extract_v8i16_nxv8i16( %vec) nounwind { +; CHECK-LABEL: extract_v8i16_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %retval = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16( %vec, i64 0) + ret <8 x i16> %retval +} + +; Goes through memory currently; idx != 0. +define <8 x i16> @extract_v8i16_nxv8i16_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v8i16_nxv8i16_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16( %vec, i64 1) + ret <8 x i16> %retval +} + +; Should codegen to a nop, since idx is zero. +define <16 x i8> @extract_v16i8_nxv16i8( %vec) nounwind { +; CHECK-LABEL: extract_v16i8_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %retval = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8( %vec, i64 0) + ret <16 x i8> %retval +} + +; Goes through memory currently; idx != 0. +define <16 x i8> @extract_v16i8_nxv16i8_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v16i8_nxv16i8_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8( %vec, i64 1) + ret <16 x i8> %retval +} + +declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(, i64) +declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(, i64) +declare <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(, i64) +declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +define @insert_v2i64_nxv2i64( %vec, <2 x i64> %subvec) nounwind { +; CHECK-LABEL: insert_v2i64_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv2i64.v2i64( %vec, <2 x i64> %subvec, i64 0) + ret %retval +} + +define @insert_v2i64_nxv2i64_idx1( %vec, <2 x i64> %subvec) nounwind { +; CHECK-LABEL: insert_v2i64_nxv2i64_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv2i64.v2i64( %vec, <2 x i64> %subvec, i64 1) + ret %retval +} + +define @insert_v4i32_nxv4i32( %vec, <4 x i32> %subvec) nounwind { +; CHECK-LABEL: insert_v4i32_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 0) + ret %retval +} + +define @insert_v4i32_nxv4i32_idx1( %vec, <4 x i32> %subvec) nounwind { +; CHECK-LABEL: insert_v4i32_nxv4i32_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 1) + ret %retval +} + +define @insert_v8i16_nxv8i16( %vec, <8 x i16> %subvec) nounwind { +; CHECK-LABEL: insert_v8i16_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv8i16.v8i16( %vec, <8 x i16> %subvec, i64 0) + ret %retval +} + +define @insert_v8i16_nxv8i16_idx1( %vec, <8 x i16> %subvec) nounwind { +; CHECK-LABEL: insert_v8i16_nxv8i16_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv8i16.v8i16( %vec, <8 x i16> %subvec, i64 1) + ret %retval +} + +define @insert_v16i8_nxv16i8( %vec, <16 x i8> %subvec) nounwind { +; CHECK-LABEL: insert_v16i8_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv16i8.v16i8( %vec, <16 x i8> %subvec, i64 0) + ret %retval +} + +define @insert_v16i8_nxv16i8_idx1( %vec, <16 x i8> %subvec) nounwind { +; CHECK-LABEL: insert_v16i8_nxv16i8_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv16i8.v16i8( %vec, <16 x i8> %subvec, i64 1) + ret %retval +} + +declare @llvm.experimental.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.experimental.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.experimental.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.experimental.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-extract.ll b/llvm/test/Transforms/InstCombine/canonicalize-vector-extract.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/canonicalize-vector-extract.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | 
FileCheck %s + +; llvm.experimental.vector.extract canonicalizes to shufflevector in the fixed case. In the +; scalable case, we lower to the EXTRACT_SUBVECTOR ISD node. + +declare <10 x i32> @llvm.experimental.vector.extract.v10i32.v8i32(<8 x i32> %vec, i64 %idx) +declare <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 %idx) +declare <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 %idx) +declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 %idx) +declare <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.extract.v8i32.v8i32(<8 x i32> %vec, i64 %idx) + +; ============================================================================ ; +; Trivial cases +; ============================================================================ ; + +; Extracting the entirety of a vector is a nop. +define <8 x i32> @trivial_nop(<8 x i32> %vec) { +; CHECK-LABEL: @trivial_nop( +; CHECK-NEXT: ret <8 x i32> [[VEC:%.*]] +; + %1 = call <8 x i32> @llvm.experimental.vector.extract.v8i32.v8i32(<8 x i32> %vec, i64 0) + ret <8 x i32> %1 +} + +; ============================================================================ ; +; Valid canonicalizations +; ============================================================================ ; + +define <2 x i32> @valid_extraction_a(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_a( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 0) + ret <2 x i32> %1 +} + +define <2 x i32> @valid_extraction_b(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_b( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> 
@llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 2) + ret <2 x i32> %1 +} + +define <2 x i32> @valid_extraction_c(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_c( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 4) + ret <2 x i32> %1 +} + +define <2 x i32> @valid_extraction_d(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_d( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 6) + ret <2 x i32> %1 +} + +define <4 x i32> @valid_extraction_e(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_e( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 0) + ret <4 x i32> %1 +} + +define <4 x i32> @valid_extraction_f(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_f( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 4) + ret <4 x i32> %1 +} + +define <3 x i32> @valid_extraction_g(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_g( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <3 x i32> +; CHECK-NEXT: ret <3 x i32> [[TMP1]] +; + %1 = call <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 0) + ret <3 x i32> %1 +} + +define <3 x i32> @valid_extraction_h(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_h( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <3 x i32> +; 
CHECK-NEXT: ret <3 x i32> [[TMP1]] +; + %1 = call <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 3) + ret <3 x i32> %1 +} + +; ============================================================================ ; +; Invalid canonicalizations +; ============================================================================ ; + +; Idx must be a constant multiple of the destination vector's length, +; otherwise the result is undefined. +define <4 x i32> @idx_not_constant_multiple(<8 x i32> %vec) { +; CHECK-LABEL: @idx_not_constant_multiple( +; CHECK-NEXT: ret <4 x i32> undef +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 1) + ret <4 x i32> %1 +} + +; If the extraction overruns the vector, the result is undefined. +define <10 x i32> @extract_overrun(<8 x i32> %vec) { +; CHECK-LABEL: @extract_overrun( +; CHECK-NEXT: ret <10 x i32> undef +; + %1 = call <10 x i32> @llvm.experimental.vector.extract.v10i32.v8i32(<8 x i32> %vec, i64 0) + ret <10 x i32> %1 +} + +; ============================================================================ ; +; Scalable cases +; ============================================================================ ; + +; Scalable extractions should not be canonicalized. This will be lowered to the +; EXTRACT_SUBVECTOR ISD node later. 
+define <4 x i32> @scalable_extract( %vec) { +; CHECK-LABEL: @scalable_extract( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( [[VEC:%.*]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 0) + ret <4 x i32> %1 +} diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; llvm.experimental.vector.insert canonicalizes to shufflevector in the fixed case. In the +; scalable case, we lower to the INSERT_SUBVECTOR ISD node. + +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 %idx) +declare @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 %idx) + +; ============================================================================ ; +; Trivial cases +; ============================================================================ ; + +; An insert that entirely overwrites a vector with another vector of the same +; type is a nop. 
+define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) {
+; CHECK-LABEL: @trivial_nop(
+; CHECK-NEXT:    ret <8 x i32> [[SUBVEC:%.*]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+; ============================================================================ ;
+; Valid canonicalizations
+; ============================================================================ ;
+
+define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_a(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_b(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_b(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_c(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_c(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 4)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_d(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 6)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_e(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_f(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_g(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_h(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 3)
+  ret <8 x i32> %1
+}
+
+; ============================================================================ ;
+; Invalid canonicalizations
+; ============================================================================ ;
+
+; Idx must be a constant multiple of the subvector's minimum vector
+; length, otherwise the result is undefined.
+define <8 x i32> @idx_not_constant_multiple(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @idx_not_constant_multiple(
+; CHECK-NEXT:    ret <8 x i32> undef
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 2)
+  ret <8 x i32> %1
+}
+
+; If the insertion overruns the vector, the result is undefined.
+define <8 x i32> @insert_overrun(<8 x i32> %vec, <8 x i32> %subvec) {
+; CHECK-LABEL: @insert_overrun(
+; CHECK-NEXT:    ret <8 x i32> undef
+;
+  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 4)
+  ret <8 x i32> %1
+}
+
+; ============================================================================ ;
+; Scalable cases
+; ============================================================================ ;
+
+; Scalable insertions should not be canonicalized. This will be lowered to the
+; INSERT_SUBVECTOR ISD node later.
+define <vscale x 4 x i32> @scalable_insert(<vscale x 4 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @scalable_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <vscale x 4 x i32> %1
+}
diff --git a/llvm/test/Verifier/extract-vector-mismatched-element-types.ll b/llvm/test/Verifier/extract-vector-mismatched-element-types.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Verifier/extract-vector-mismatched-element-types.ll
@@ -0,0 +1,9 @@
+; RUN: not opt -verify -S < %s 2>&1> /dev/null | FileCheck %s
+
+; CHECK: experimental_vector_extract result must have the same element type as the input vector.
+define <16 x i16> @invalid_mismatched_element_types(<vscale x 16 x i8> %vec) nounwind {
+  %retval = call <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv16i8(<vscale x 16 x i8> %vec, i64 0)
+  ret <16 x i16> %retval
+}
+
+declare <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv16i8(<vscale x 16 x i8>, i64)
diff --git a/llvm/test/Verifier/insert-vector-mismatched-element-types.ll b/llvm/test/Verifier/insert-vector-mismatched-element-types.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Verifier/insert-vector-mismatched-element-types.ll
@@ -0,0 +1,9 @@
+; RUN: not opt -verify -S < %s 2>&1> /dev/null | FileCheck %s
+
+; CHECK: experimental_vector_insert parameters must have the same element type.
+define <vscale x 16 x i8> @invalid_mismatched_element_types(<vscale x 16 x i8> %vec, <4 x i16> %subvec) nounwind {
+  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v4i16(<vscale x 16 x i8> %vec, <4 x i16> %subvec, i64 0)
+  ret <vscale x 16 x i8> %retval
+}
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v4i16(<vscale x 16 x i8>, <4 x i16>, i64)