diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -858,8 +858,11 @@ SK_ExtractSubvector, ///< ExtractSubvector Index indicates start offset. SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one ///< with any shuffle mask. - SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any + SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any ///< shuffle mask. + SK_Splice ///< Concatenates elements from the first input vector + ///< with elements of the second input vector. Returning + ///< a vector of the same type as the input vectors. }; /// Kind of the reduction data. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -825,6 +825,7 @@ case TTI::SK_Transpose: case TTI::SK_InsertSubvector: case TTI::SK_ExtractSubvector: + case TTI::SK_Splice: break; } return Kind; @@ -838,6 +839,7 @@ case TTI::SK_Broadcast: return getBroadcastShuffleOverhead(cast(Tp)); case TTI::SK_Select: + case TTI::SK_Splice: case TTI::SK_Reverse: case TTI::SK_Transpose: case TTI::SK_PermuteSingleSrc: @@ -1371,6 +1373,12 @@ cast(Args[0]->getType()), None, 0, cast(RetTy)); } + case Intrinsic::experimental_vector_splice: { + unsigned Index = cast(Args[2])->getZExtValue(); + return thisT()->getShuffleCost(TTI::SK_Splice, + cast(Args[0]->getType()), None, + Index, cast(RetTy)); + } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: case Intrinsic::vector_reduce_and: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -166,6 +166,8 @@ bool IsPairwiseForm, TTI::TargetCostKind CostKind); + InstructionCost getSpliceCost(VectorType *Tp, int Index); + InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -791,6 +791,7 @@ { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, + { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, @@ -856,7 +857,6 @@ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, - // LowerVectorFP_TO_INT { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, @@ -1806,6 +1806,69 @@ CostKind); } +static MVT getPromotedTypeForPredicate (MVT M) { + switch (M.SimpleTy) { + case MVT::nxv16i1: + return MVT::nxv16i8; + case MVT::nxv8i1: + return MVT::nxv8i16; + case MVT::nxv4i1: + return MVT::nxv4i32; + case MVT::nxv2i1: + return MVT::nxv2i64; + default: + return MVT::Untyped; + } + return MVT::Untyped; +} + +InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { + static const CostTblEntry ShuffleTbl[] = { + { TTI::SK_Splice, MVT::nxv16i8, 1 }, + { TTI::SK_Splice, MVT::nxv8i16, 1 }, + { TTI::SK_Splice, MVT::nxv4i32, 1 }, + { TTI::SK_Splice, MVT::nxv2i64, 1 }, + { TTI::SK_Splice, MVT::nxv2f16, 1 }, + { TTI::SK_Splice, MVT::nxv4f16, 1 }, + { TTI::SK_Splice, MVT::nxv8f16, 1 }, + { TTI::SK_Splice, MVT::nxv2bf16, 1 }, + { TTI::SK_Splice, MVT::nxv4bf16, 1 }, + { TTI::SK_Splice, MVT::nxv8bf16, 1 }, + { TTI::SK_Splice, MVT::nxv2f32, 1 }, + { TTI::SK_Splice, MVT::nxv4f32, 1 }, + { TTI::SK_Splice, MVT::nxv2f64, 1 }, + }; + + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + MVT PromotedVT = LT.second.getScalarType() == MVT::i1 + ? getPromotedTypeForPredicate(LT.second) + : LT.second; + Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); + InstructionCost LegalizationCost = 0; + if (Index < 0) { + LegalizationCost = + getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind) + + getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + } + + // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp + // Cost performed on a promoted type. + if (LT.second.getScalarType() == MVT::i1) { + LegalizationCost += getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, + TTI::CastContextHint::None, CostKind) + + getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, + TTI::CastContextHint::None, CostKind); + } + const auto *Entry = CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT); + assert(Entry && "Illegal Type for Splice"); + LegalizationCost += Entry->Cost; + return LegalizationCost * LT.first; +} + InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, @@ -1902,6 +1965,7 @@ if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; } - + if (Kind == TTI::SK_Splice && isa(Tp)) + return getSpliceCost(Tp, Index); return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } diff --git a/llvm/test/Analysis/CostModel/AArch64/splice.ll b/llvm/test/Analysis/CostModel/AArch64/splice.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/splice.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -analyze -cost-model -S -mtriple=aarch64--linux-gnu | FileCheck %s + +define void @vector_splice() #0 { +; CHECK-LABEL: 'vector_splice' +; CHECK-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1) +; + %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1) + %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1) + %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1) + %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1) + %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1) + %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1) + %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1) + %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1) + %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1) + %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1) + %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1) + %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1) + %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1) + %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1) + %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1) + %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1) + %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1) + %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1) + %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1) + %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1) + %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1) + %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1) + %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1) + %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1) + %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1) + %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1) + %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1) + ret void +} + +declare <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32) +declare <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32) +declare <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32) +declare <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32) +declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32) +declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32) +declare <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32) +declare <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32) +declare <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32) +declare <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32) +declare <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32) +declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32) +declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32) +declare <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32) +declare <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32) +declare <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half>, <2 x half>, i32) +declare <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half>, <4 x half>, i32) +declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32) +declare <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half>, <16 x half>, i32) +declare <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32) +declare <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32) +declare <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32) +declare <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32) +declare <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float>, <2 x float>, i32) +declare <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float>, <4 x float>, i32) +declare <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float>, <8 x float>, i32) +declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32) +declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32) +declare <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double>, <4 x double>, i32) + +attributes #0 = { "target-features"="+bf16" } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -1,4 +1,5 @@ -; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -analyze -cost-model -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s define void @vector_insert_extract( %v0, %v1, <16 x i32> %v2) { ; CHECK-LABEL: 'vector_insert_extract' @@ -231,4 +232,149 @@ declare @llvm.log2.nxv4f32() declare @llvm.log10.nxv4f32() +define void @vector_splice() #0 { +; CHECK-LABEL: 'vector_splice' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call @llvm.experimental.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call @llvm.experimental.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call @llvm.experimental.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call @llvm.experimental.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call @llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call @llvm.experimental.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call @llvm.experimental.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call @llvm.experimental.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call @llvm.experimental.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call @llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 -1) + + %splice_nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2i16 = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4i16 = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv8i16 = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv16i16 = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4i32 = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv8i32 = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2i64 = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4i64 = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2f16 = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4f16 = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv8f16 = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv16f16 = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2f32 = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4f32 = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv8f32 = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2f64 = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4f64 = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2bf16 = call @llvm.experimental.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4bf16 = call @llvm.experimental.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv8bf16 = call @llvm.experimental.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv16bf16 = call @llvm.experimental.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv16i1 = call @llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv8i1 = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv4i1 = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 1) + %splice_nxv2i1 = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 1) +;; negative Index + %splice_nxv16i8_neg = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv32i8_neg = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2i16_neg = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i16_neg = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8i16_neg = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16i16_neg = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i32_neg = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8i32_neg = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2i64_neg= call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i64_neg = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2f16_neg = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4f16_neg = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8f16_neg = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16f16_neg = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2f32_neg = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4f32_neg = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8f32_neg = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2f64_neg = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4f64_neg = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2bf16_neg = call @llvm.experimental.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4bf16_neg = call @llvm.experimental.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8bf16_neg = call @llvm.experimental.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16bf16_neg = call @llvm.experimental.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv16i1_neg = call @llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv8i1_neg = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv4i1_neg = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 -1) + %splice_nxv2i1_neg = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 -1) + ret void +} + +declare @llvm.experimental.vector.splice.nxv2i1(, , i32) +declare @llvm.experimental.vector.splice.nxv4i1(, , i32) +declare @llvm.experimental.vector.splice.nxv8i1(, , i32) +declare @llvm.experimental.vector.splice.nxv16i1(, , i32) +declare @llvm.experimental.vector.splice.nxv2i8(, , i32) +declare @llvm.experimental.vector.splice.nxv16i8(, , i32) +declare @llvm.experimental.vector.splice.nxv32i8(, , i32) +declare @llvm.experimental.vector.splice.nxv2i16(, , i32) +declare @llvm.experimental.vector.splice.nxv4i16(, , i32) +declare @llvm.experimental.vector.splice.nxv8i16(, , i32) +declare @llvm.experimental.vector.splice.nxv16i16(, , i32) +declare @llvm.experimental.vector.splice.nxv4i32(, , i32) +declare @llvm.experimental.vector.splice.nxv8i32(, , i32) +declare @llvm.experimental.vector.splice.nxv2i64(, , i32) +declare @llvm.experimental.vector.splice.nxv4i64(, , i32) +declare @llvm.experimental.vector.splice.nxv2f16(, , i32) +declare @llvm.experimental.vector.splice.nxv4f16(, , i32) +declare @llvm.experimental.vector.splice.nxv8f16(, , i32) +declare @llvm.experimental.vector.splice.nxv16f16(, , i32) +declare @llvm.experimental.vector.splice.nxv2bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv4bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv8bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv16bf16(, , i32) +declare @llvm.experimental.vector.splice.nxv2f32(, , i32) +declare @llvm.experimental.vector.splice.nxv4f32(, , i32) +declare @llvm.experimental.vector.splice.nxv8f32(, , i32) +declare @llvm.experimental.vector.splice.nxv16f32(, , i32) +declare @llvm.experimental.vector.splice.nxv2f64(, , i32) +declare @llvm.experimental.vector.splice.nxv4f64(, , i32) + attributes #0 = { "target-features"="+sve,+bf16" }