diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1434,6 +1434,110 @@
   return std::nullopt;
 }

+static SmallVector<Value *> SimplifyValuePattern(SmallVector<Value *> Vec,
+                                                 bool AllowPoison) {
+  std::size_t VecSize = Vec.size();
+  if (VecSize < 2 || !isPowerOf2_64(VecSize))
+    return Vec;
+  std::size_t HalfVecSize = VecSize / 2;
+
+  SmallVector<Value *> Lhs(Vec.begin(), Vec.begin() + HalfVecSize);
+  SmallVector<Value *> Rhs(Vec.begin() + HalfVecSize, Vec.end());
+
+  for (std::size_t I = 0; I < Lhs.size(); I++) {
+    if (Lhs[I] != nullptr && Rhs[I] != nullptr) {
+      if (Lhs[I] == Rhs[I])
+        continue;
+      else
+        return Vec;
+    }
+    if (!AllowPoison)
+      return Vec;
+    if (Lhs[I] == nullptr && Rhs[I] != nullptr)
+      Lhs[I] = Rhs[I];
+  }
+
+  return SimplifyValuePattern(Lhs, AllowPoison);
+}
+
+// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
+// to dupqlane(f64(C)) where C is A concatenated with B.
+static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+                                                           IntrinsicInst &II) {
+  auto *VecInsert = dyn_cast<IntrinsicInst>(II.getOperand(0));
+  if (!VecInsert || VecInsert->getIntrinsicID() != Intrinsic::vector_insert ||
+      !isa<FixedVectorType>(VecInsert->getOperand(1)->getType()))
+    return std::nullopt;
+  auto *IIScalableTy = cast<ScalableVectorType>(II.getType());
+
+  // Insert the scalars into a SmallVector ordered by InsertElement index.
+  Value *CurrentInsertElt = VecInsert->getOperand(1);
+  uint64_t NumElements = IIScalableTy->getMinNumElements();
+  SmallVector<Value *> InsertEltVec(NumElements, nullptr);
+  for (uint64_t I = NumElements; I > 0; I--) {
+    InsertElementInst *InsertElt =
+        dyn_cast<InsertElementInst>(CurrentInsertElt);
+    if (!InsertElt)
+      break;
+    auto *Idx = cast<ConstantInt>(InsertElt->getOperand(2));
+    InsertEltVec[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
+    CurrentInsertElt = InsertElt->getOperand(0);
+  }
+
+  bool AllowPoison = isa<PoisonValue>(VecInsert->getOperand(0));
+  SmallVector<Value *> Pattern =
+      SimplifyValuePattern(InsertEltVec, AllowPoison);
+  // Bail out if no pattern was found.
+  if (Pattern == InsertEltVec)
+    return std::nullopt;
+
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+
+  // Rebuild the simplified chain of InsertElements, e.g. (a, b, a, b) as (a, b).
+  Value *InsertEltChain = nullptr;
+  for (size_t I = 0; I < Pattern.size(); I++) {
+    if (Pattern[I] == nullptr)
+      continue;
+    Constant *Idx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, I));
+    if (InsertEltChain == nullptr)
+      InsertEltChain = Builder.CreateInsertElement(
+          PoisonValue::get(CurrentInsertElt->getType()), Pattern[I], Idx);
+    else
+      InsertEltChain =
+          Builder.CreateInsertElement(InsertEltChain, Pattern[I], Idx);
+  }
+  if (InsertEltChain == nullptr)
+    return std::nullopt;
+
+  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one
+  // i64 value or (f16 a, f16 b) as one i32 value. This requires an
+  // InsertSubvector to be bitcast to a type wide enough to fit the sequence,
+  // be splatted, and then be narrowed back to the original type.
+  int PatternWidth = IIScalableTy->getScalarSizeInBits() * Pattern.size();
+  int PatternElementCount = IIScalableTy->getScalarSizeInBits() *
+                            IIScalableTy->getMinNumElements() / PatternWidth;
+
+  IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
+  ScalableVectorType *WideScalableTy =
+      ScalableVectorType::get(WideTy, PatternElementCount);
+  ScalableVectorType *WideShuffleMaskTy =
+      ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
+
+  Constant *ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
+  CallInst *InsertSubvector = Builder.CreateInsertVector(
+      II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
+  Value *WideBitcast =
+      Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
+  auto *WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
+  Value *WideShuffle = Builder.CreateShuffleVector(
+      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
+
+  Value *NarrowBitcast =
+      Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
+  return IC.replaceInstUsesWith(II, NarrowBitcast);
+}
+
 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   Value *A = II.getArgOperand(0);
@@ -1551,6 +1655,8 @@
     return instCombineSVESel(IC, II);
   case Intrinsic::aarch64_sve_srshl:
     return instCombineSVESrshl(IC, II);
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return instCombineSVEDupqLane(IC, II);
   }

   return std::nullopt;
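For reference, the halving recursion in SimplifyValuePattern can be read in isolation. The following is a standalone sketch that mirrors the algorithm over std::string values instead of llvm::Value pointers; the names, the driver in main, and the use of empty strings for unwritten (poison) lanes are illustrative only and not part of the patch. It shows (a, b, a, b) collapsing to (a, b), a missing lane being filled from the other half when poison is allowed, and a non-repeating sequence being returned unchanged.

// Standalone mirror of SimplifyValuePattern, for illustration only.
// Empty strings stand in for unwritten (poison) lanes.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

static bool isPowerOfTwo(std::size_t N) { return N && (N & (N - 1)) == 0; }

static std::vector<std::string>
simplifyValuePattern(std::vector<std::string> Vec, bool AllowPoison) {
  std::size_t VecSize = Vec.size();
  if (VecSize < 2 || !isPowerOfTwo(VecSize))
    return Vec;
  std::size_t Half = VecSize / 2;

  std::vector<std::string> Lhs(Vec.begin(), Vec.begin() + Half);
  std::vector<std::string> Rhs(Vec.begin() + Half, Vec.end());

  for (std::size_t I = 0; I < Lhs.size(); I++) {
    // Both lanes written: they must match, otherwise there is no pattern.
    if (!Lhs[I].empty() && !Rhs[I].empty()) {
      if (Lhs[I] == Rhs[I])
        continue;
      return Vec;
    }
    // One lane unwritten: only acceptable when the base vector was poison.
    if (!AllowPoison)
      return Vec;
    if (Lhs[I].empty() && !Rhs[I].empty())
      Lhs[I] = Rhs[I];
  }
  // The two halves agree; try to halve again.
  return simplifyValuePattern(Lhs, AllowPoison);
}

int main() {
  for (const auto &In : {std::vector<std::string>{"a", "b", "a", "b"},
                         std::vector<std::string>{"a", "", "a", "b"},
                         std::vector<std::string>{"a", "b", "c", "b"}}) {
    auto Out = simplifyValuePattern(In, /*AllowPoison=*/true);
    std::cout << In.size() << " lanes -> " << Out.size() << " lanes\n";
  }
}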
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -587,49 +587,35 @@
 define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
 ; CHECK-LABEL: dupq_f32_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT:    mov v2.s[1], v1.s[0]
-; CHECK-NEXT:    mov v2.s[2], v0.s[0]
-; CHECK-NEXT:    mov v2.s[3], v1.s[0]
-; CHECK-NEXT:    mov z0.q, q2
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %1 = insertelement <4 x float> undef, float %x, i64 0
   %2 = insertelement <4 x float> %1, float %y, i64 1
-  %3 = insertelement <4 x float> %2, float %x, i64 2
-  %4 = insertelement <4 x float> %3, float %y, i64 3
-  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
-  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
+  %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x i64>
+  %5 = shufflevector <vscale x 2 x i64> %4, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %6 = bitcast <vscale x 2 x i64> %5 to <vscale x 4 x float>
   ret <vscale x 4 x float> %6
 }

-define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %x, half %y) {
 ; CHECK-LABEL: dupq_f16_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-NEXT:    mov v2.h[4], v0.h[0]
-; CHECK-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-NEXT:    mov v2.h[6], v0.h[0]
-; CHECK-NEXT:    mov v2.h[7], v1.h[0]
-; CHECK-NEXT:    mov z0.q, q2
-; CHECK-NEXT:    ret
-  %1 = insertelement <8 x half> undef, half %a, i64 0
-  %2 = insertelement <8 x half> %1, half %b, i64 1
-  %3 = insertelement <8 x half> %2, half %a, i64 2
-  %4 = insertelement <8 x half> %3, half %b, i64 3
-  %5 = insertelement <8 x half> %4, half %a, i64 4
-  %6 = insertelement <8 x half> %5, half %b, i64 5
-  %7 = insertelement <8 x half> %6, half %a, i64 6
-  %8 = insertelement <8 x half> %7, half %b, i64 7
-  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
-  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
-  ret <vscale x 8 x half> %10
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %x, i64 0
+  %2 = insertelement <8 x half> %1, half %y, i64 1
+  %3 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %2, i64 0)
+  %4 = bitcast <vscale x 8 x half> %3 to <vscale x 4 x i32>
+  %5 = shufflevector <vscale x 4 x i32> %4, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %6 = bitcast <vscale x 4 x i32> %5 to <vscale x 8 x half>
+  ret <vscale x 8 x half> %6
 }

 define <vscale x 16 x i8> @ext_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
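The two CodeGen changes above follow from the PatternWidth / PatternElementCount arithmetic in instCombineSVEDupqLane: an (x, y) pair of f32s occupies 64 bits, so the splat is done on a <vscale x 2 x i64> and lowers to "mov z0.d, d0", while an (a, b) pair of f16s occupies 32 bits and is splatted on a <vscale x 4 x i32>, lowering to "mov z0.s, s0". A minimal compile-time restatement of that arithmetic, standalone and not part of the patch:

// Compile-time sanity check of the width arithmetic used by the combine.
// PatternWidth = element bits * pattern length; the splat is then performed
// on PatternElementCount lanes of an integer that wide.
constexpr int patternWidth(int ScalarBits, int PatternLen) {
  return ScalarBits * PatternLen;
}
constexpr int patternElementCount(int ScalarBits, int MinNumElts, int Width) {
  return ScalarBits * MinNumElts / Width;
}

// nxv4f32 with an (x, y) pattern: splat one i64 across <vscale x 2 x i64>,
// which the backend emits as "mov z0.d, d0".
static_assert(patternWidth(32, 2) == 64, "f32 pair is 64 bits");
static_assert(patternElementCount(32, 4, 64) == 2, "two i64 lanes");

// nxv8f16 with an (a, b) pattern: splat one i32 across <vscale x 4 x i32>,
// which the backend emits as "mov z0.s, s0".
static_assert(patternWidth(16, 2) == 32, "f16 pair is 32 bits");
static_assert(patternElementCount(16, 8, 32) == 4, "four i32 lanes");

int main() { return 0; }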
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 4 x float> @dupq_f32_ab_pattern(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_ab_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x float> [[TMP3]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> poison, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %x, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_a_pattern(half %a) {
+; CHECK-LABEL: @dupq_f16_a_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP4]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %a, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %a, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %c, i64 2
+  %4 = insertelement <8 x half> %3, half %d, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %c, i64 6
+  %8 = insertelement <8 x half> %7, half %d, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern_reverted_insert(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern_reverted_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %d, i64 7
+  %2 = insertelement <8 x half> %1, half %c, i64 6
+  %3 = insertelement <8 x half> %2, half %b, i64 5
+  %4 = insertelement <8 x half> %3, half %a, i64 4
+  %5 = insertelement <8 x half> %4, half %d, i64 3
+  %6 = insertelement <8 x half> %5, half %c, i64 2
+  %7 = insertelement <8 x half> %6, half %b, i64 1
+  %8 = insertelement <8 x half> %7, half %a, i64 0
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_front_indices(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_front_indices(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 2
+  %2 = insertelement <8 x half> %1, half %b, i64 3
+  %3 = insertelement <8 x half> %2, half %a, i64 4
+  %4 = insertelement <8 x half> %3, half %b, i64 5
+  %5 = insertelement <8 x half> %4, half %a, i64 6
+  %6 = insertelement <8 x half> %5, half %b, i64 7
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_middle_indices(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_middle_indices(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 5
+  %5 = insertelement <8 x half> %4, half %a, i64 6
+  %6 = insertelement <8 x half> %5, half %b, i64 7
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_end_indices(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_end_indices(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern_no_end_indices_not_poison(half %a, half %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: @dupq_f16_ab_pattern_no_end_indices_not_poison(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> [[C:%.*]], <8 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP7]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> %c, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_front_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_front_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_middle_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_middle_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[B:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[A]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %b, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %a, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_end_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_end_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[A]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+
+attributes #0 = { "target-features"="+sve" }
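For context, insertelement chains feeding llvm.vector.insert and llvm.aarch64.sve.dupq.lane, as tested above, are what Clang typically emits for the ACLE svdupq_n_* intrinsics. The sketch below shows source that should benefit from this combine; it is illustrative only, assumes arm_sve.h and an SVE-enabled AArch64 target, and the claim that these specific calls now fold to a single 64-bit or 32-bit dup is inferred from the tests above rather than verified separately.

// Illustrative ACLE source producing the dupq.lane pattern targeted here.
// Build with e.g. clang++ -O2 --target=aarch64-linux-gnu -march=armv8-a+sve -c
#include <arm_sve.h>

// Repeats the pair {x, y} through each 128-bit quadword; the (x, y, x, y)
// pattern is the dupqlane(f32 A, f32 B, f32 A, f32 B) case from the comment
// in instCombineSVEDupqLane and should collapse to one 64-bit splat.
svfloat32_t repeat_pair_f32(float x, float y) {
  return svdupq_n_f32(x, y, x, y);
}

// Same idea with a 32-bit pair of halfs repeated through the quadword.
svfloat16_t repeat_pair_f16(float16_t a, float16_t b) {
  return svdupq_n_f16(a, b, a, b, a, b, a, b);
}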