diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1434,6 +1434,110 @@
   return std::nullopt;
 }

+static SmallVector<Value *> SimplifyValuePattern(SmallVector<Value *> Vec,
+                                                 bool AllowPoison) {
+  std::size_t VecSize = Vec.size();
+  if (VecSize < 2 || !isPowerOf2_64(VecSize))
+    return Vec;
+  std::size_t HalfVecSize = VecSize / 2;
+
+  SmallVector<Value *> Lhs(Vec.begin(), Vec.begin() + HalfVecSize);
+  SmallVector<Value *> Rhs(Vec.begin() + HalfVecSize, Vec.end());
+
+  for (std::size_t I = 0; I < Lhs.size(); I++) {
+    if (Lhs[I] != nullptr && Rhs[I] != nullptr) {
+      if (Lhs[I] == Rhs[I])
+        continue;
+      else
+        return Vec;
+    }
+    if (!AllowPoison)
+      return Vec;
+    if (Lhs[I] == nullptr && Rhs[I] != nullptr)
+      Lhs[I] = Rhs[I];
+  }
+
+  return SimplifyValuePattern(Lhs, AllowPoison);
+}
+
+// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
+// to dupqlane(f64(C)) where C is A concatenated with B.
+static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+                                                           IntrinsicInst &II) {
+  auto *VecInsert = dyn_cast<IntrinsicInst>(II.getOperand(0));
+  if (!VecInsert || VecInsert->getIntrinsicID() != Intrinsic::vector_insert ||
+      !isa<FixedVectorType>(VecInsert->getOperand(1)->getType()))
+    return std::nullopt;
+  auto *IIScalableTy = cast<ScalableVectorType>(II.getType());
+
+  // Insert the scalars into a SmallVector ordered by InsertElement index.
+  Value *CurrentInsertElt = VecInsert->getOperand(1);
+  uint64_t NumElements = IIScalableTy->getMinNumElements();
+  SmallVector<Value *> InsertEltVec(NumElements, nullptr);
+  for (uint64_t I = NumElements; I > 0; I--) {
+    InsertElementInst *InsertElt =
+        dyn_cast<InsertElementInst>(CurrentInsertElt);
+    if (!InsertElt)
+      break;
+    auto *Idx = cast<ConstantInt>(InsertElt->getOperand(2));
+    InsertEltVec[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
+    CurrentInsertElt = InsertElt->getOperand(0);
+  }
+
+  bool AllowPoison = isa<PoisonValue>(VecInsert->getOperand(0));
+  SmallVector<Value *> Pattern =
+      SimplifyValuePattern(InsertEltVec, AllowPoison);
+  // Bail out if no pattern was found.
+  if (Pattern == InsertEltVec)
+    return std::nullopt;
+
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+
+  // Rebuild the simplified chain of InsertElements, e.g. (a, b, a, b) as (a, b).
+  Value *InsertEltChain = nullptr;
+  for (size_t I = 0; I < Pattern.size(); I++) {
+    if (Pattern[I] == nullptr)
+      continue;
+    Constant *Idx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, I));
+    if (InsertEltChain == nullptr)
+      InsertEltChain = Builder.CreateInsertElement(
+          PoisonValue::get(CurrentInsertElt->getType()), Pattern[I], Idx);
+    else
+      InsertEltChain =
+          Builder.CreateInsertElement(InsertEltChain, Pattern[I], Idx);
+  }
+  if (InsertEltChain == nullptr)
+    return std::nullopt;
+
+  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one
+  // i64 value or (f16 a, f16 b) as one i32 value. This requires an
+  // InsertSubvector to be bitcast to a type wide enough to fit the sequence,
+  // be splatted, and then be narrowed back to the original type.
+  int PatternWidth = IIScalableTy->getScalarSizeInBits() * Pattern.size();
+  int PatternElementCount = IIScalableTy->getScalarSizeInBits() *
+                            IIScalableTy->getMinNumElements() / PatternWidth;
+
+  IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
+  ScalableVectorType *WideScalableTy =
+      ScalableVectorType::get(WideTy, PatternElementCount);
+  ScalableVectorType *WideShuffleMaskTy =
+      ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
+
+  Constant *ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
+  CallInst *InsertSubvector = Builder.CreateInsertVector(
+      II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
+  Value *WideBitcast =
+      Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
+  auto *WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
+  Value *WideShuffle = Builder.CreateShuffleVector(
+      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
+
+  Value *NarrowBitcast =
+      Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
+  return IC.replaceInstUsesWith(II, NarrowBitcast);
+}
+
 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   Value *A = II.getArgOperand(0);
@@ -1551,6 +1655,8 @@
     return instCombineSVESel(IC, II);
   case Intrinsic::aarch64_sve_srshl:
     return instCombineSVESrshl(IC, II);
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return instCombineSVEDupqLane(IC, II);
   }

   return std::nullopt;
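For reference, the halving recursion in SimplifyValuePattern can be read in isolation. The following is a standalone sketch that mirrors the algorithm over std::string values instead of llvm::Value pointers; the names, the driver in main, and the use of empty strings for unwritten (poison) lanes are illustrative only and not part of the patch. It shows (a, b, a, b) collapsing to (a, b), a missing lane being filled from the other half when poison is allowed, and a non-repeating sequence being returned unchanged.

// Standalone mirror of SimplifyValuePattern, for illustration only.
// Empty strings stand in for unwritten (poison) lanes.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

static bool isPowerOfTwo(std::size_t N) { return N && (N & (N - 1)) == 0; }

static std::vector<std::string>
simplifyValuePattern(std::vector<std::string> Vec, bool AllowPoison) {
  std::size_t VecSize = Vec.size();
  if (VecSize < 2 || !isPowerOfTwo(VecSize))
    return Vec;
  std::size_t Half = VecSize / 2;

  std::vector<std::string> Lhs(Vec.begin(), Vec.begin() + Half);
  std::vector<std::string> Rhs(Vec.begin() + Half, Vec.end());

  for (std::size_t I = 0; I < Lhs.size(); I++) {
    // Both lanes written: they must match, otherwise there is no pattern.
    if (!Lhs[I].empty() && !Rhs[I].empty()) {
      if (Lhs[I] == Rhs[I])
        continue;
      return Vec;
    }
    // One lane unwritten: only acceptable when the base vector was poison.
    if (!AllowPoison)
      return Vec;
    if (Lhs[I].empty() && !Rhs[I].empty())
      Lhs[I] = Rhs[I];
  }
  // The two halves agree; try to halve again.
  return simplifyValuePattern(Lhs, AllowPoison);
}

int main() {
  for (const auto &In : {std::vector<std::string>{"a", "b", "a", "b"},
                         std::vector<std::string>{"a", "", "a", "b"},
                         std::vector<std::string>{"a", "b", "c", "b"}}) {
    auto Out = simplifyValuePattern(In, /*AllowPoison=*/true);
    std::cout << In.size() << " lanes -> " << Out.size() << " lanes\n";
  }
}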
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -587,49 +587,35 @@
 define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
 ; CHECK-LABEL: dupq_f32_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT:    mov v2.s[1], v1.s[0]
-; CHECK-NEXT:    mov v2.s[2], v0.s[0]
-; CHECK-NEXT:    mov v2.s[3], v1.s[0]
-; CHECK-NEXT:    mov z0.q, q2
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %1 = insertelement <4 x float> undef, float %x, i64 0
   %2 = insertelement <4 x float> %1, float %y, i64 1
-  %3 = insertelement <4 x float> %2, float %x, i64 2
-  %4 = insertelement <4 x float> %3, float %y, i64 3
-  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
-  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
+  %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x i64>
+  %5 = shufflevector <vscale x 2 x i64> %4, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %6 = bitcast <vscale x 2 x i64> %5 to <vscale x 4 x float>
   ret <vscale x 4 x float> %6
 }

-define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %x, half %y) {
 ; CHECK-LABEL: dupq_f16_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-NEXT:    mov v2.h[4], v0.h[0]
-; CHECK-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-NEXT:    mov v2.h[6], v0.h[0]
-; CHECK-NEXT:    mov v2.h[7], v1.h[0]
-; CHECK-NEXT:    mov z0.q, q2
-; CHECK-NEXT:    ret
-  %1 = insertelement <8 x half> undef, half %a, i64 0
-  %2 = insertelement <8 x half> %1, half %b, i64 1
-  %3 = insertelement <8 x half> %2, half %a, i64 2
-  %4 = insertelement <8 x half> %3, half %b, i64 3
-  %5 = insertelement <8 x half> %4, half %a, i64 4
-  %6 = insertelement <8 x half> %5, half %b, i64 5
-  %7 = insertelement <8 x half> %6, half %a, i64 6
-  %8 = insertelement <8 x half> %7, half %b, i64 7
-  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
-  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
-  ret <vscale x 8 x half> %10
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %x, i64 0
+  %2 = insertelement <8 x half> %1, half %y, i64 1
+  %3 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %2, i64 0)
+  %4 = bitcast <vscale x 8 x half> %3 to <vscale x 4 x i32>
+  %5 = shufflevector <vscale x 4 x i32> %4, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %6 = bitcast <vscale x 4 x i32> %5 to <vscale x 8 x half>
+  ret <vscale x 8 x half> %6
 }

 define <vscale x 16 x i8> @ext_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
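The two CodeGen changes above follow from the PatternWidth / PatternElementCount arithmetic in instCombineSVEDupqLane: an (x, y) pair of f32s occupies 64 bits, so the splat is done on a <vscale x 2 x i64> and lowers to "mov z0.d, d0", while an (a, b) pair of f16s occupies 32 bits and is splatted on a <vscale x 4 x i32>, lowering to "mov z0.s, s0". A minimal compile-time restatement of that arithmetic, standalone and not part of the patch:

// Compile-time sanity check of the width arithmetic used by the combine.
// PatternWidth = element bits * pattern length; the splat is then performed
// on PatternElementCount lanes of an integer that wide.
constexpr int patternWidth(int ScalarBits, int PatternLen) {
  return ScalarBits * PatternLen;
}
constexpr int patternElementCount(int ScalarBits, int MinNumElts, int Width) {
  return ScalarBits * MinNumElts / Width;
}

// nxv4f32 with an (x, y) pattern: splat one i64 across <vscale x 2 x i64>,
// which the backend emits as "mov z0.d, d0".
static_assert(patternWidth(32, 2) == 64, "f32 pair is 64 bits");
static_assert(patternElementCount(32, 4, 64) == 2, "two i64 lanes");

// nxv8f16 with an (a, b) pattern: splat one i32 across <vscale x 4 x i32>,
// which the backend emits as "mov z0.s, s0".
static_assert(patternWidth(16, 2) == 32, "f16 pair is 32 bits");
static_assert(patternElementCount(16, 8, 32) == 4, "four i32 lanes");

int main() { return 0; }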
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 4 x float> @dupq_f32_ab_pattern(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_ab_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x float> [[TMP3]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> poison, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %x, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_a_pattern(half %a) {
+; CHECK-LABEL: @dupq_f16_a_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP4]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %a, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %a, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %c, i64 2
+  %4 = insertelement <8 x half> %3, half %d, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %c, i64 6
+  %8 = insertelement <8 x half> %7, half %d, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern_reverted_insert(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern_reverted_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %d, i64 7
+  %2 = insertelement <8 x half> %1, half %c, i64 6
+  %3 = insertelement <8 x half> %2, half %b, i64 5
+  %4 = insertelement <8 x half> %3, half %a, i64 4
+  %5 = insertelement <8 x half> %4, half %d, i64 3
+  %6 = insertelement <8 x half> %5, half %c, i64 2
+  %7 = insertelement <8 x half> %6, half %b, i64 1
+  %8 = insertelement <8 x half> %7, half %a, i64 0
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_front_indices(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_front_indices(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 2
+  %2 = insertelement <8 x half> %1, half %b, i64 3
+  %3 = insertelement <8 x half> %2, half %a, i64 4
+  %4 = insertelement <8 x half> %3, half %b, i64 5
+  %5 = insertelement <8 x half> %4, half %a, i64 6
+  %6 = insertelement <8 x half> %5, half %b, i64 7
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_middle_indices(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_middle_indices(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 5
+  %5 = insertelement <8 x half> %4, half %a, i64 6
+  %6 = insertelement <8 x half> %5, half %b, i64 7
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_end_indices(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_end_indices(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_end_indices_not_poison(half %a, half %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_end_indices_not_poison(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> [[C:%.*]], <8 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP7]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> %c, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_front_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_front_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_middle_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_middle_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[B:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[A]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %b, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %a, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_end_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_end_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[A]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+
+attributes #0 = { "target-features"="+sve" }
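For context, insertelement chains feeding llvm.vector.insert and llvm.aarch64.sve.dupq.lane, as tested above, are what Clang typically emits for the ACLE svdupq_n_* intrinsics. The sketch below shows source that should benefit from this combine; it is illustrative only, assumes arm_sve.h and an SVE-enabled AArch64 target, and the claim that these specific calls now fold to a single 64-bit or 32-bit dup is inferred from the tests above rather than verified separately.

// Illustrative ACLE source producing the dupq.lane pattern targeted here.
// Build with e.g. clang++ -O2 --target=aarch64-linux-gnu -march=armv8-a+sve -c
#include <arm_sve.h>

// Repeats the pair {x, y} through each 128-bit quadword; the (x, y, x, y)
// pattern is the dupqlane(f32 A, f32 B, f32 A, f32 B) case from the comment
// in instCombineSVEDupqLane and should collapse to one 64-bit splat.
svfloat32_t repeat_pair_f32(float x, float y) {
  return svdupq_n_f32(x, y, x, y);
}

// Same idea with a 32-bit pair of halfs repeated through the quadword.
svfloat16_t repeat_pair_f16(float16_t a, float16_t b) {
  return svdupq_n_f16(a, b, a, b, a, b, a, b);
}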