diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -858,8 +858,11 @@ SK_ExtractSubvector, ///< ExtractSubvector Index indicates start offset. SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one ///< with any shuffle mask. - SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any + SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any ///< shuffle mask. + SK_Splice ///< Concatenates elements from the first input vector + ///< with elements of the second input vector, returning + ///< a vector of the same type as the input vectors. }; /// Kind of the reduction data. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -825,6 +825,7 @@ case TTI::SK_Transpose: case TTI::SK_InsertSubvector: case TTI::SK_ExtractSubvector: + case TTI::SK_Splice: break; } return Kind; @@ -838,6 +839,7 @@ case TTI::SK_Broadcast: return getBroadcastShuffleOverhead(cast(Tp)); case TTI::SK_Select: + case TTI::SK_Splice: case TTI::SK_Reverse: case TTI::SK_Transpose: case TTI::SK_PermuteSingleSrc: @@ -1371,6 +1373,11 @@ cast(Args[0]->getType()), None, 0, cast(RetTy)); } + case Intrinsic::experimental_vector_splice: { + return thisT()->getShuffleCost(TTI::SK_Splice, + cast(Args[0]->getType()), None, + 0, cast(RetTy)); + } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: case Intrinsic::vector_reduce_and: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1813,7 +1813,7 @@ Kind = improveShuffleKindFromMask(Kind, 
Mask); if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || - Kind == TTI::SK_Reverse) { + Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { static const CostTblEntry ShuffleTbl[] = { // Broadcast shuffle kinds can be performed with 'dup'. { TTI::SK_Broadcast, MVT::v8i8, 1 }, @@ -1897,6 +1897,24 @@ { TTI::SK_Reverse, MVT::nxv8i1, 1 }, { TTI::SK_Reverse, MVT::nxv4i1, 1 }, { TTI::SK_Reverse, MVT::nxv2i1, 1 }, + // Handle the cases for vector.splice with scalable vectors + { TTI::SK_Splice, MVT::nxv16i8, 1 }, + { TTI::SK_Splice, MVT::nxv8i16, 1 }, + { TTI::SK_Splice, MVT::nxv4i32, 1 }, + { TTI::SK_Splice, MVT::nxv2i64, 1 }, + { TTI::SK_Splice, MVT::nxv2f16, 1 }, + { TTI::SK_Splice, MVT::nxv4f16, 1 }, + { TTI::SK_Splice, MVT::nxv8f16, 1 }, + { TTI::SK_Splice, MVT::nxv2bf16, 1 }, + { TTI::SK_Splice, MVT::nxv4bf16, 1 }, + { TTI::SK_Splice, MVT::nxv8bf16, 1 }, + { TTI::SK_Splice, MVT::nxv2f32, 1 }, + { TTI::SK_Splice, MVT::nxv4f32, 1 }, + { TTI::SK_Splice, MVT::nxv2f64, 1 }, + { TTI::SK_Splice, MVT::nxv16i1, 1 }, + { TTI::SK_Splice, MVT::nxv8i1, 1 }, + { TTI::SK_Splice, MVT::nxv4i1, 1 }, + { TTI::SK_Splice, MVT::nxv2i1, 1 }, }; std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) diff --git a/llvm/test/Analysis/CostModel/AArch64/splice.ll b/llvm/test/Analysis/CostModel/AArch64/splice.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/splice.ll @@ -0,0 +1,94 @@ +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s + +define void @vector_splice() #0 { + ;CHECK-LABEL: 'vector_splice': + ;CHECK-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %splice.nv16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %splice.nv32i8 = 
call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.nv4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.nv8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %splice.nv16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.nv4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %splice.nv8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 2) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 -2) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %splice.nv4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 -1) + ;CHECK-NEXT: 
Cost Model: Found an estimated cost of 18 for instruction: %splice.nv4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.nv8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %splice.nv16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.nv4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %splice.nv8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 -2) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %splice.nv4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.nv4bf16 = 
call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.nv8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %splice.nv16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %splice.nv16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.nv8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.nv4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.nv2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 -1) + ;CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void + + %splice.nv16i8 = call < 16 x i8> @llvm.experimental.vector.splice.nv16i8(< 16 x i8> zeroinitializer, < 16 x i8> zeroinitializer, i32 -1) + %splice.nv32i8 = call < 32 x i8> @llvm.experimental.vector.splice.nv32i8(< 32 x i8> zeroinitializer, < 32 x i8> zeroinitializer, i32 -1) + %splice.nv2i16 = call < 2 x i16> @llvm.experimental.vector.splice.nv2i16(< 2 x i16> zeroinitializer, < 2 x i16> zeroinitializer, i32 -1) + %splice.nv4i16 = call < 4 x i16> @llvm.experimental.vector.splice.nv4i16(< 4 x i16> zeroinitializer, < 4 x i16> 
zeroinitializer, i32 -1) + %splice.nv8i16 = call < 8 x i16> @llvm.experimental.vector.splice.nv8i16(< 8 x i16> zeroinitializer, < 8 x i16> zeroinitializer, i32 -1) + %splice.nv16i16 = call < 16 x i16> @llvm.experimental.vector.splice.nv16i16(< 16 x i16> zeroinitializer, < 16 x i16> zeroinitializer, i32 -1) + %splice.nv4i32 = call < 4 x i32> @llvm.experimental.vector.splice.nv4i32(< 4 x i32> zeroinitializer, < 4 x i32> zeroinitializer, i32 0) + %splice.nv8i32 = call < 8 x i32> @llvm.experimental.vector.splice.nv8i32(< 8 x i32> zeroinitializer, < 8 x i32> zeroinitializer, i32 2) + %splice.nv2i64 = call < 2 x i64> @llvm.experimental.vector.splice.nv2i64(< 2 x i64> zeroinitializer, < 2 x i64> zeroinitializer, i32 -2) + %splice.nv4i64 = call < 4 x i64> @llvm.experimental.vector.splice.nv4i64(< 4 x i64> zeroinitializer, < 4 x i64> zeroinitializer, i32 -1) + %splice.nv2f16 = call < 2 x half> @llvm.experimental.vector.splice.nv2f16(< 2 x half> zeroinitializer, < 2 x half> zeroinitializer, i32 -1) + %splice.nv4f16 = call < 4 x half> @llvm.experimental.vector.splice.nv4f16(< 4 x half> zeroinitializer, < 4 x half> zeroinitializer, i32 -1) + %splice.nv8f16 = call < 8 x half> @llvm.experimental.vector.splice.nv8f16(< 8 x half> zeroinitializer, < 8 x half> zeroinitializer, i32 -1) + %splice.nv16f16 = call < 16 x half> @llvm.experimental.vector.splice.nv16f16(< 16 x half> zeroinitializer, < 16 x half> zeroinitializer, i32 -1) + %splice.nv2f32 = call < 2 x float> @llvm.experimental.vector.splice.nv2f32(< 2 x float> zeroinitializer, < 2 x float> zeroinitializer, i32 -1) + %splice.nv4f32 = call < 4 x float> @llvm.experimental.vector.splice.nv4f32(< 4 x float> zeroinitializer, < 4 x float> zeroinitializer, i32 -1) + %splice.nv8f32 = call < 8 x float> @llvm.experimental.vector.splice.nv8f32(< 8 x float> zeroinitializer, < 8 x float> zeroinitializer, i32 -1) + %splice.nv2f64 = call < 2 x double> @llvm.experimental.vector.splice.nv2f64(< 2 x double> zeroinitializer, < 2 x double> 
zeroinitializer, i32 -2) + %splice.nv4f64 = call < 4 x double> @llvm.experimental.vector.splice.nv4f64(< 4 x double> zeroinitializer, < 4 x double> zeroinitializer, i32 -1) + %splice.nv2bf16 = call < 2 x bfloat> @llvm.experimental.vector.splice.nv2bf16(< 2 x bfloat> zeroinitializer, < 2 x bfloat> zeroinitializer, i32 -1) + %splice.nv4bf16 = call < 4 x bfloat> @llvm.experimental.vector.splice.nv4bf16(< 4 x bfloat> zeroinitializer, < 4 x bfloat> zeroinitializer, i32 -1) + %splice.nv8bf16 = call < 8 x bfloat> @llvm.experimental.vector.splice.nv8bf16(< 8 x bfloat> zeroinitializer, < 8 x bfloat> zeroinitializer, i32 -1) + %splice.nv16bf16 = call < 16 x bfloat> @llvm.experimental.vector.splice.nv16bf16(< 16 x bfloat> zeroinitializer, < 16 x bfloat> zeroinitializer, i32 -1) + %splice.nv16i1 = call < 16 x i1> @llvm.experimental.vector.splice.nv16i1(< 16 x i1> zeroinitializer, < 16 x i1> zeroinitializer, i32 -1) + %splice.nv8i1 = call < 8 x i1> @llvm.experimental.vector.splice.nv8i1(< 8 x i1> zeroinitializer, < 8 x i1> zeroinitializer, i32 -1) + %splice.nv4i1 = call < 4 x i1> @llvm.experimental.vector.splice.nv4i1(< 4 x i1> zeroinitializer, < 4 x i1> zeroinitializer, i32 -1) + %splice.nv2i1 = call < 2 x i1> @llvm.experimental.vector.splice.nv2i1(< 2 x i1> zeroinitializer, < 2 x i1> zeroinitializer, i32 -1) + ret void +} + +declare < 2 x i1> @llvm.experimental.vector.splice.nv2i1(< 2 x i1>, < 2 x i1>, i32) +declare < 4 x i1> @llvm.experimental.vector.splice.nv4i1(< 4 x i1>, < 4 x i1>, i32) +declare < 8 x i1> @llvm.experimental.vector.splice.nv8i1(< 8 x i1>, < 8 x i1>, i32) +declare < 16 x i1> @llvm.experimental.vector.splice.nv16i1(< 16 x i1>, < 16 x i1>, i32) +declare < 2 x i8> @llvm.experimental.vector.splice.nv2i8(< 2 x i8>, < 2 x i8>, i32) +declare < 16 x i8> @llvm.experimental.vector.splice.nv16i8(< 16 x i8>, < 16 x i8>, i32) +declare < 32 x i8> @llvm.experimental.vector.splice.nv32i8(< 32 x i8>, < 32 x i8>, i32) +declare < 2 x i16> 
@llvm.experimental.vector.splice.nv2i16(< 2 x i16>, < 2 x i16>, i32) +declare < 4 x i16> @llvm.experimental.vector.splice.nv4i16(< 4 x i16>, < 4 x i16>, i32) +declare < 8 x i16> @llvm.experimental.vector.splice.nv8i16(< 8 x i16>, < 8 x i16>, i32) +declare < 16 x i16> @llvm.experimental.vector.splice.nv16i16(< 16 x i16>, < 16 x i16>, i32) +declare < 4 x i32> @llvm.experimental.vector.splice.nv4i32(< 4 x i32>, < 4 x i32>, i32) +declare < 8 x i32> @llvm.experimental.vector.splice.nv8i32(< 8 x i32>, < 8 x i32>, i32) +declare < 2 x i64> @llvm.experimental.vector.splice.nv2i64(< 2 x i64>, < 2 x i64>, i32) +declare < 4 x i64> @llvm.experimental.vector.splice.nv4i64(< 4 x i64>, < 4 x i64>, i32) +declare < 2 x half> @llvm.experimental.vector.splice.nv2f16(< 2 x half>, < 2 x half>, i32) +declare < 4 x half> @llvm.experimental.vector.splice.nv4f16(< 4 x half>, < 4 x half>, i32) +declare < 8 x half> @llvm.experimental.vector.splice.nv8f16(< 8 x half>, < 8 x half>, i32) +declare < 16 x half> @llvm.experimental.vector.splice.nv16f16(< 16 x half>, < 16 x half>, i32) +declare < 2 x bfloat> @llvm.experimental.vector.splice.nv2bf16(< 2 x bfloat>, < 2 x bfloat>, i32) +declare < 4 x bfloat> @llvm.experimental.vector.splice.nv4bf16(< 4 x bfloat>, < 4 x bfloat>, i32) +declare < 8 x bfloat> @llvm.experimental.vector.splice.nv8bf16(< 8 x bfloat>, < 8 x bfloat>, i32) +declare < 16 x bfloat> @llvm.experimental.vector.splice.nv16bf16(< 16 x bfloat>, < 16 x bfloat>, i32) +declare < 2 x float> @llvm.experimental.vector.splice.nv2f32(< 2 x float>, < 2 x float>, i32) +declare < 4 x float> @llvm.experimental.vector.splice.nv4f32(< 4 x float>, < 4 x float>, i32) +declare < 8 x float> @llvm.experimental.vector.splice.nv8f32(< 8 x float>, < 8 x float>, i32) +declare < 16 x float> @llvm.experimental.vector.splice.nv16f32(< 16 x float>, < 16 x float>, i32) +declare < 2 x double> @llvm.experimental.vector.splice.nv2f64(< 2 x double>, < 2 x double>, i32) +declare < 4 x double> 
@llvm.experimental.vector.splice.nv4f64(< 4 x double>, < 4 x double>, i32) + +attributes #0 = { "target-features"="+bf16" } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -231,4 +231,94 @@ declare @llvm.log2.nxv4f32() declare @llvm.log10.nxv4f32() +define void @vector_splice() #0 { +; CHECK-LABEL: 'vector_splice': +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 2) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %splice_nxv2i64 = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 -2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 -2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %splice_nxv2bf16 = call @llvm.experimental.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call @llvm.experimental.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call @llvm.experimental.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call @llvm.experimental.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i1 = call @llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i1 = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i1 = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i1 = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 -1) + + %splice_nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2i16 = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i16 = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8i16 = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16i16 = call 
@llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i32 = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 0) + %splice_nxv8i32 = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 2) + %splice_nxv2i64 = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 -2) + %splice_nxv4i64 = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2f16 = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4f16 = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8f16 = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16f16 = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2f32 = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4f32 = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8f32 = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2f64 = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 -2) + %splice_nxv4f64 = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2bf16 = call @llvm.experimental.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4bf16 = call @llvm.experimental.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8bf16 = call @llvm.experimental.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16bf16 = call @llvm.experimental.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16i1 = call 
@llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8i1 = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i1 = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2i1 = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 -1) + ret void +} + +declare @llvm.experimental.vector.splice.nxv2i1(, , i32) +declare @llvm.experimental.vector.splice.nxv4i1(, , i32) +declare @llvm.experimental.vector.splice.nxv8i1(, , i32) +declare @llvm.experimental.vector.splice.nxv16i1(, , i32) +declare @llvm.experimental.vector.splice.nxv2i8(, , i32) +declare @llvm.experimental.vector.splice.nxv16i8(, , i32) +declare @llvm.experimental.vector.splice.nxv32i8(, , i32) +declare @llvm.experimental.vector.splice.nxv2i16(, , i32) +declare @llvm.experimental.vector.splice.nxv4i16(, , i32) +declare @llvm.experimental.vector.splice.nxv8i16(, , i32) +declare @llvm.experimental.vector.splice.nxv16i16(, , i32) +declare @llvm.experimental.vector.splice.nxv4i32(, , i32) +declare @llvm.experimental.vector.splice.nxv8i32(, , i32) +declare @llvm.experimental.vector.splice.nxv2i64(, , i32) +declare @llvm.experimental.vector.splice.nxv4i64(, , i32) +declare @llvm.experimental.vector.splice.nxv2f16(, , i32) +declare @llvm.experimental.vector.splice.nxv4f16(, , i32) +declare @llvm.experimental.vector.splice.nxv8f16(, , i32) +declare @llvm.experimental.vector.splice.nxv16f16(, , i32) +declare @llvm.experimental.vector.splice.nxv2bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv4bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv8bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv16bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv2f32(, , i32) +declare @llvm.experimental.vector.splice.nxv4f32(, , i32) +declare @llvm.experimental.vector.splice.nxv8f32(, , i32) +declare 
@llvm.experimental.vector.splice.nxv16f32(, , i32) +declare @llvm.experimental.vector.splice.nxv2f64(, , i32) +declare @llvm.experimental.vector.splice.nxv4f64(, , i32) + attributes #0 = { "target-features"="+sve,+bf16" }