diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1434,6 +1434,144 @@
   return std::nullopt;
 }
 
+// Shrink Vec in place to the shortest power-of-two-sized prefix that, when
+// repeated, reproduces the whole sequence, e.g. (a, b, a, b) becomes (a, b).
+//
+// A nullptr entry is a lane no value was recorded for. If AllowPoison is true
+// such a lane matches anything and takes the value of its counterpart in the
+// other half; if AllowPoison is false any nullptr entry is a mismatch.
+//
+// Returns true if Vec has exactly one element or was halved at least once.
+// Returns false, leaving the size of Vec unchanged, if the size is not a power
+// of two or the two halves disagree. With AllowPoison set, some nullptr lanes
+// may already have been filled in by the time a mismatch is found.
+bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
+  size_t VecSize = Vec.size();
+  // A single element is already the shortest pattern. Without this check the
+  // loop below would compare the element with itself and the vector would then
+  // be resized to zero elements.
+  if (VecSize == 1)
+    return true;
+  if (!isPowerOf2_64(VecSize))
+    return false;
+  size_t HalfVecSize = VecSize / 2;
+
+  for (auto Lhs = Vec.begin(), Rhs = Vec.begin() + HalfVecSize;
+       Rhs != Vec.end(); Lhs++, Rhs++) {
+    if (*Lhs != nullptr && *Rhs != nullptr) {
+      if (*Lhs == *Rhs)
+        continue;
+      return false;
+    }
+    if (!AllowPoison)
+      return false;
+    if (*Lhs == nullptr && *Rhs != nullptr)
+      *Lhs = *Rhs;
+  }
+
+  Vec.resize(HalfVecSize);
+  // Try to halve again. The result is deliberately ignored: one halving has
+  // already succeeded, so a failure further down only means we stop here.
+  SimplifyValuePattern(Vec, AllowPoison);
+  return true;
+}
+
+// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
+// to dupqlane(f64(C)) where C is A concatenated with B
+//
+// Only dupq_lane(vector.insert(Default, SubVec, 0), 0) is handled, where SubVec
+// is a fixed-width vector built by a chain of constant-index insertelements.
+// Returns the replacement for II, or std::nullopt if the fold does not apply.
+static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+                                                           IntrinsicInst &II) {
+  Value *CurrentInsertElt = nullptr, *Default = nullptr;
+  // The rewrite below places the pattern at element 0 of a poison vector and
+  // splats element 0 of the widened vector. That only reproduces the original
+  // result if the subvector was inserted at index 0 and quadword 0 is the one
+  // being duplicated, so both index operands must be zero.
+  if (!match(II.getOperand(0),
+             m_Intrinsic<Intrinsic::vector_insert>(
+                 m_Value(Default), m_Value(CurrentInsertElt), m_ZeroInt())) ||
+      !match(II.getOperand(1), m_ZeroInt()) ||
+      !isa<FixedVectorType>(CurrentInsertElt->getType()))
+    return std::nullopt;
+  auto *IIScalableTy = cast<ScalableVectorType>(II.getType());
+  auto *SubVecTy = cast<FixedVectorType>(CurrentInsertElt->getType());
+
+  // Insert the scalars into a SmallVector ordered by InsertElement index
+  SmallVector<Value *> InsertEltVec(IIScalableTy->getMinNumElements(),
+                                    nullptr);
+  while (auto *InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
+    // A variable index cannot be placed in the table, and an index beyond the
+    // table or beyond the subvector does not name a usable lane.
+    auto *Idx = dyn_cast<ConstantInt>(InsertElt->getOperand(2));
+    if (!Idx || Idx->getValue().uge(InsertEltVec.size()) ||
+        Idx->getValue().uge(SubVecTy->getNumElements()))
+      return std::nullopt;
+    // The chain is walked from the last insertion back to the first, so the
+    // first value seen for a lane is the one that survives. Do not let an
+    // earlier insertion that was later overwritten replace it.
+    Value *&Lane = InsertEltVec[Idx->getZExtValue()];
+    if (Lane == nullptr)
+      Lane = InsertElt->getOperand(1);
+    CurrentInsertElt = InsertElt->getOperand(0);
+  }
+
+  // A lane with no recorded value comes from the vector the insertelement
+  // chain starts from or, past the end of the subvector, from Default. Such
+  // lanes may only be treated as poison if both of those are poison.
+  bool AllowPoison =
+      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
+  // Bail if we can't find a pattern
+  if (!SimplifyValuePattern(InsertEltVec, AllowPoison))
+    return std::nullopt;
+
+  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  Value *InsertEltChain = nullptr;
+  for (size_t I = 0; I < InsertEltVec.size(); I++) {
+    if (InsertEltVec[I] == nullptr)
+      continue;
+    if (InsertEltChain == nullptr)
+      InsertEltChain = PoisonValue::get(SubVecTy);
+    InsertEltChain = Builder.CreateInsertElement(
+        InsertEltChain, InsertEltVec[I], Builder.getInt64(I));
+  }
+  // Every lane of the pattern is poison: there is nothing to splat.
+  if (InsertEltChain == nullptr)
+    return std::nullopt;
+
+  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
+  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
+  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
+  // be narrowed back to the original type.
+  unsigned PatternWidth =
+      IIScalableTy->getScalarSizeInBits() * InsertEltVec.size();
+  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
+                                 IIScalableTy->getMinNumElements() /
+                                 PatternWidth;
+
+  IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
+  auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
+  auto *WideShuffleMaskTy =
+      ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
+
+  CallInst *InsertSubvector = Builder.CreateInsertVector(
+      II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
+      Builder.getInt64(0));
+  Value *WideBitcast =
+      Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
+  auto *WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
+  Value *WideShuffle = Builder.CreateShuffleVector(
+      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
+
+  Value *NarrowBitcast =
+      Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
+
+  return IC.replaceInstUsesWith(II, NarrowBitcast);
+}
+
 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   Value *A = 
II.getArgOperand(0); @@ -1551,6 +1648,8 @@ return instCombineSVESel(IC, II); case Intrinsic::aarch64_sve_srshl: return instCombineSVESrshl(IC, II); + case Intrinsic::aarch64_sve_dupq_lane: + return instCombineSVEDupqLane(IC, II); } return std::nullopt; diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll @@ -587,49 +587,35 @@ define dso_local @dupq_f32_repeat_complex(float %x, float %y) { ; CHECK-LABEL: dupq_f32_repeat_complex: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: mov v2.s[1], v1.s[0] -; CHECK-NEXT: mov v2.s[2], v0.s[0] -; CHECK-NEXT: mov v2.s[3], v1.s[0] -; CHECK-NEXT: mov z0.q, q2 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: ret %1 = insertelement <4 x float> undef, float %x, i64 0 %2 = insertelement <4 x float> %1, float %y, i64 1 - %3 = insertelement <4 x float> %2, float %x, i64 2 - %4 = insertelement <4 x float> %3, float %y, i64 3 - %5 = tail call @llvm.vector.insert.nxv4f32.v4f32( undef, <4 x float> %4, i64 0) - %6 = tail call @llvm.aarch64.sve.dupq.lane.nxv4f32( %5, i64 0) + %3 = call @llvm.vector.insert.nxv4f32.v4f32( undef, <4 x float> %2, i64 0) + %4 = bitcast %3 to + %5 = shufflevector %4, poison, zeroinitializer + %6 = bitcast %5 to ret %6 } -define dso_local @dupq_f16_repeat_complex(half %a, half %b) { +define dso_local @dupq_f16_repeat_complex(half %x, half %y) { ; CHECK-LABEL: dupq_f16_repeat_complex: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 -; CHECK-NEXT: mov v2.h[1], v1.h[0] -; 
CHECK-NEXT: mov v2.h[2], v0.h[0] -; CHECK-NEXT: mov v2.h[3], v1.h[0] -; CHECK-NEXT: mov v2.h[4], v0.h[0] -; CHECK-NEXT: mov v2.h[5], v1.h[0] -; CHECK-NEXT: mov v2.h[6], v0.h[0] -; CHECK-NEXT: mov v2.h[7], v1.h[0] -; CHECK-NEXT: mov z0.q, q2 -; CHECK-NEXT: ret - %1 = insertelement <8 x half> undef, half %a, i64 0 - %2 = insertelement <8 x half> %1, half %b, i64 1 - %3 = insertelement <8 x half> %2, half %a, i64 2 - %4 = insertelement <8 x half> %3, half %b, i64 3 - %5 = insertelement <8 x half> %4, half %a, i64 4 - %6 = insertelement <8 x half> %5, half %b, i64 5 - %7 = insertelement <8 x half> %6, half %a, i64 6 - %8 = insertelement <8 x half> %7, half %b, i64 7 - %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( undef, <8 x half> %8, i64 0) - %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) - ret %10 +; CHECK-NEXT: mov v0.h[1], v1.h[0] +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ret + %1 = insertelement <8 x half> undef, half %x, i64 0 + %2 = insertelement <8 x half> %1, half %y, i64 1 + %3 = call @llvm.vector.insert.nxv8f16.v8f16( undef, <8 x half> %2, i64 0) + %4 = bitcast %3 to + %5 = shufflevector %4, poison, zeroinitializer + %6 = bitcast %5 to + ret %6 } define @ext_i8( %a, %b) { diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define dso_local @dupq_f32_ab_pattern(float %x, float %y) { +; CHECK-LABEL: @dupq_f32_ab_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call 
@llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector [[TMP4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: ret [[TMP6]] +; + %1 = insertelement <4 x float> poison, float %x, i64 0 + %2 = insertelement <4 x float> %1, float %y, i64 1 + %3 = insertelement <4 x float> %2, float %x, i64 2 + %4 = insertelement <4 x float> %3, float %y, i64 3 + %5 = tail call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> %4, i64 0) + %6 = tail call @llvm.aarch64.sve.dupq.lane.nxv4f32( %5, i64 0) + ret %6 +} + +define dso_local @dupq_f16_a_pattern(half %a) { +; CHECK-LABEL: @dupq_f16_a_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( [[TMP3]], i64 0) +; CHECK-NEXT: ret [[TMP4]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %a, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %a, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %a, i64 5 + %7 = insertelement <8 x half> %6, half %a, i64 6 + %8 = insertelement <8 x half> %7, half %a, i64 7 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +define dso_local @dupq_f16_ab_pattern(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], 
half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector [[TMP4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: ret [[TMP6]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 5 + %7 = insertelement <8 x half> %6, half %a, i64 6 + %8 = insertelement <8 x half> %7, half %b, i64 7 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +define dso_local @dupq_f16_abcd_pattern(half %a, half %b, half %c, half %d) { +; CHECK-LABEL: @dupq_f16_abcd_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector [[TMP6]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +; CHECK-NEXT: ret [[TMP8]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %c, i64 2 + %4 = insertelement <8 x half> %3, half %d, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 
5 + %7 = insertelement <8 x half> %6, half %c, i64 6 + %8 = insertelement <8 x half> %7, half %d, i64 7 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +define dso_local @dupq_f16_abcnull_pattern(half %a, half %b, half %c, half %d) { +; CHECK-LABEL: @dupq_f16_abcnull_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector [[TMP5]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +; CHECK-NEXT: ret [[TMP7]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %c, i64 2 + %4 = insertelement <8 x half> %3, half %a, i64 4 + %5 = insertelement <8 x half> %4, half %b, i64 5 + %6 = insertelement <8 x half> %5, half %c, i64 6 + %7 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %6, i64 0) + %8 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %7, i64 0) + ret %8 +} + +define dso_local @dupq_f16_abcd_pattern_double_insert(half %a, half %b, half %c, half %d) { +; CHECK-LABEL: @dupq_f16_abcd_pattern_double_insert( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( 
poison, <8 x half> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector [[TMP6]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +; CHECK-NEXT: ret [[TMP8]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %c, i64 2 + %4 = insertelement <8 x half> %3, half %d, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 5 + %7 = insertelement <8 x half> %6, half %c, i64 6 + %8 = insertelement <8 x half> %7, half %c, i64 7 + %9 = insertelement <8 x half> %8, half %d, i64 7 + %10 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %9, i64 0) + %11 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %10, i64 0) + ret %11 +} + +define dso_local @dupq_f16_abcd_pattern_reverted_insert(half %a, half %b, half %c, half %d) { +; CHECK-LABEL: @dupq_f16_abcd_pattern_reverted_insert( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector [[TMP6]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +; CHECK-NEXT: ret [[TMP8]] +; + %1 = insertelement <8 x half> poison, half %d, i64 7 + %2 = insertelement <8 x half> %1, half %c, i64 6 + %3 = insertelement <8 x half> %2, half %b, i64 5 + %4 = insertelement <8 x half> %3, half %a, i64 4 + %5 = insertelement <8 x half> %4, half %d, i64 3 + %6 = insertelement <8 x half> %5, half %c, i64 2 + %7 
= insertelement <8 x half> %6, half %b, i64 1 + %8 = insertelement <8 x half> %7, half %a, i64 0 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +define dso_local @dupq_f16_ab_pattern_no_front_indices(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_pattern_no_front_indices( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector [[TMP4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: ret [[TMP6]] +; + %1 = insertelement <8 x half> poison, half %a, i64 2 + %2 = insertelement <8 x half> %1, half %b, i64 3 + %3 = insertelement <8 x half> %2, half %a, i64 4 + %4 = insertelement <8 x half> %3, half %b, i64 5 + %5 = insertelement <8 x half> %4, half %a, i64 6 + %6 = insertelement <8 x half> %5, half %b, i64 7 + %7 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %6, i64 0) + %8 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %7, i64 0) + ret %8 +} + +define dso_local @dupq_f16_ab_pattern_no_middle_indices(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_pattern_no_middle_indices( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector [[TMP4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: ret [[TMP6]] +; + %1 = insertelement <8 x 
half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 5 + %5 = insertelement <8 x half> %4, half %a, i64 6 + %6 = insertelement <8 x half> %5, half %b, i64 7 + %7 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %6, i64 0) + %8 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %7, i64 0) + ret %8 +} + +define dso_local @dupq_f16_ab_pattern_no_end_indices(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_pattern_no_end_indices( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector [[TMP4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: ret [[TMP6]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 5 + %7 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %6, i64 0) + %8 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %7, i64 0) + ret %8 +} + +define dso_local @dupq_f16_ab_pattern_no_end_indices_not_poison(half %a, half %b, %c) { +; CHECK-LABEL: @dupq_f16_ab_pattern_no_end_indices_not_poison( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> 
[[TMP3]], half [[B]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f16.v8f16( [[C:%.*]], <8 x half> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( [[TMP7]], i64 0) +; CHECK-NEXT: ret [[TMP8]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 5 + %7 = tail call @llvm.vector.insert.nxv8f16.v8f16( %c, <8 x half> %6, i64 0) + %8 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %7, i64 0) + ret %8 +} + +define dso_local @dupq_f16_ab_no_front_pattern(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_no_front_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7 +; CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( [[TMP9]], i64 0) +; CHECK-NEXT: ret [[TMP10]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement 
<8 x half> %1, half %a, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 5 + %7 = insertelement <8 x half> %6, half %a, i64 6 + %8 = insertelement <8 x half> %7, half %b, i64 7 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +define dso_local @dupq_f16_ab_no_middle_pattern(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_no_middle_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[B:%.*]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[A]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7 +; CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( [[TMP9]], i64 0) +; CHECK-NEXT: ret [[TMP10]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %a, i64 1 + %3 = insertelement <8 x half> %2, half %b, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %a, i64 5 + %7 = insertelement <8 x half> %6, half %a, i64 6 + %8 = insertelement <8 x half> %7, half %b, i64 7 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 
x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +define dso_local @dupq_f16_ab_no_end_pattern(half %a, half %b) { +; CHECK-LABEL: @dupq_f16_ab_no_end_pattern( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[A]], i64 7 +; CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( [[TMP9]], i64 0) +; CHECK-NEXT: ret [[TMP10]] +; + %1 = insertelement <8 x half> poison, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %a, i64 2 + %4 = insertelement <8 x half> %3, half %b, i64 3 + %5 = insertelement <8 x half> %4, half %a, i64 4 + %6 = insertelement <8 x half> %5, half %b, i64 5 + %7 = insertelement <8 x half> %6, half %a, i64 6 + %8 = insertelement <8 x half> %7, half %a, i64 7 + %9 = tail call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %8, i64 0) + %10 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %9, i64 0) + ret %10 +} + +declare @llvm.vector.insert.nxv8f16.v8f16(, <8 x half>, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8f16(, i64) +declare @llvm.vector.insert.nxv4f32.v4f32(, <4 x float>, i64) +declare @llvm.vector.insert.nxv2f32.v2f32(, <2 x float>, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv4f32(, i64) +declare 
@llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) +declare @llvm.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) + +attributes #0 = { "target-features"="+sve" }