diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1381,6 +1381,79 @@
   return None;
 }
 
+static Optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+                                                      IntrinsicInst &II) {
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+
+  auto VecTy = cast<ScalableVectorType>(II.getType());
+  Type *VecElTy = VecTy->getScalarType();
+
+  Type *WideElTy = nullptr;
+  if (VecElTy->isFloatTy())
+    WideElTy = Builder.getDoubleTy();
+  if (VecElTy->isHalfTy())
+    WideElTy = Builder.getFloatTy();
+  if (WideElTy == nullptr)
+    return None;
+
+  auto VecInsert = dyn_cast<IntrinsicInst>(II.getOperand(0));
+  if (!VecInsert || VecInsert->getIntrinsicID() != Intrinsic::vector_insert)
+    return None;
+
+  // Capture the sequence of scalars in reverse, i.e. y(3), x(2), y(1), x(0)
+  SmallVector<Value *> RSequence;
+  Value *CurrentValue = VecInsert->getOperand(1);
+  uint64_t NumElements = VecTy->getMinNumElements();
+  for (uint64_t i = NumElements; i > 0; i--) {
+    InsertElementInst *InsertElt = dyn_cast<InsertElementInst>(CurrentValue);
+    if (!InsertElt)
+      break;
+
+    // Bail if the insertelement indices aren't descending linearly
+    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
+    if (Idx->getValue() != APInt(64, i - 1))
+      return None;
+
+    RSequence.push_back(InsertElt->getOperand(1));
+    CurrentValue = InsertElt->getOperand(0);
+  }
+
+  if (RSequence.size() < NumElements)
+    return None;
+
+  // Check for the pattern (y, x, y, x) or (y, x, y, x, y, x, y, x)
+  for (uint64_t i = 0; i < NumElements - 2; i++)
+    if (RSequence[i] != RSequence[i + 2])
+      return None;
+
+  // Simplify the redundant long chain of insertelement's to two insertelements
+  // splatted as one widened element
+  auto Zero = ConstantInt::get(Builder.getInt64Ty(), APInt::getZero(64));
+  auto One = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 1));
+  Value *InsertEltX = Builder.CreateInsertElement(
+      UndefValue::get(CurrentValue->getType()), RSequence[1], Zero);
+  Value *InsertEltY =
+      Builder.CreateInsertElement(InsertEltX, RSequence[0], One);
+  CallInst *InsertSubvec = Builder.CreateInsertVector(
+      II.getType(), UndefValue::get(II.getType()), InsertEltY, Zero);
+
+  uint64_t HalfNumElements = NumElements / 2;
+  auto WideScalableTy = ScalableVectorType::get(WideElTy, HalfNumElements);
+  Value *WideCast =
+      Builder.CreateBitOrPointerCast(InsertSubvec, WideScalableTy);
+
+  auto ShuffleMaskTy =
+      ScalableVectorType::get(Builder.getInt32Ty(), HalfNumElements);
+  auto ShuffleMask = ConstantAggregateZero::get(ShuffleMaskTy);
+
+  Value *Shuffle = Builder.CreateShuffleVector(
+      WideCast, PoisonValue::get(WideScalableTy), ShuffleMask);
+  Value *RetCast = Builder.CreateBitOrPointerCast(Shuffle, II.getType());
+
+  return IC.replaceInstUsesWith(II, RetCast);
+}
+
 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                    IntrinsicInst &II) {
   Value *A = II.getArgOperand(0);
@@ -1497,6 +1570,8 @@
     return instCombineSVESel(IC, II);
   case Intrinsic::aarch64_sve_srshl:
     return instCombineSVESrshl(IC, II);
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return instCombineSVEDupqLane(IC, II);
   }
 
   return None;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -587,49 +587,35 @@
 define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
 ; CHECK-LABEL: dupq_f32_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT:    mov v2.s[1], v1.s[0]
-; CHECK-NEXT:    mov v2.s[2], v0.s[0]
-; CHECK-NEXT:    mov v2.s[3], v1.s[0]
-; CHECK-NEXT:    mov z0.q, q2
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %1 = insertelement <4 x float> undef, float %x, i64 0
   %2 = insertelement <4 x float> %1, float %y, i64 1
-  %3 = insertelement <4 x float> %2, float %x, i64 2
-  %4 = insertelement <4 x float> %3, float %y, i64 3
-  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
-  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
+  %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x double>
+  %5 = shufflevector <vscale x 2 x double> %4, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  %6 = bitcast <vscale x 2 x double> %5 to <vscale x 4 x float>
   ret <vscale x 4 x float> %6
 }
 
-define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %x, half %y) {
 ; CHECK-LABEL: dupq_f16_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-NEXT:    mov v2.h[4], v0.h[0]
-; CHECK-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-NEXT:    mov v2.h[6], v0.h[0]
-; CHECK-NEXT:    mov v2.h[7], v1.h[0]
-; CHECK-NEXT:    mov z0.q, q2
-; CHECK-NEXT:    ret
-  %1 = insertelement <8 x half> undef, half %a, i64 0
-  %2 = insertelement <8 x half> %1, half %b, i64 1
-  %3 = insertelement <8 x half> %2, half %a, i64 2
-  %4 = insertelement <8 x half> %3, half %b, i64 3
-  %5 = insertelement <8 x half> %4, half %a, i64 4
-  %6 = insertelement <8 x half> %5, half %b, i64 5
-  %7 = insertelement <8 x half> %6, half %a, i64 6
-  %8 = insertelement <8 x half> %7, half %b, i64 7
-  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
-  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
-  ret <vscale x 8 x half> %10
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %x, i64 0
+  %2 = insertelement <8 x half> %1, half %y, i64 1
+  %3 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %2, i64 0)
+  %4 = bitcast <vscale x 8 x half> %3 to <vscale x 4 x float>
+  %5 = shufflevector <vscale x 4 x float> %4, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %6 = bitcast <vscale x 4 x float> %5 to <vscale x 8 x half>
+  ret <vscale x 8 x half> %6
 }
 
 define <vscale x 16 x i8> @ext_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_repeat_complex(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x float> [[TMP3]] to <vscale x 2 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 2 x double> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> undef, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %x, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_repeat_complex(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x float> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 4 x float> @dupq_f32_complex_no_pattern(float %x, float %y, float %z) {
+; CHECK-LABEL: @dupq_f32_complex_no_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[Z:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Y]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> undef, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %z, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 4 x float> @dupq_f32_complex_rev(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_complex_rev(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[X]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Y]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> undef, float %x, i64 3
+  %2 = insertelement <4 x float> %1, float %y, i64 2
+  %3 = insertelement <4 x float> %2, float %x, i64 1
+  %4 = insertelement <4 x float> %3, float %y, i64 0
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_complex_missing_middle(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_complex_missing_middle(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP7]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 6
+  %6 = insertelement <8 x half> %5, half %b, i64 7
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex_bad_front(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_repeat_complex_bad_front(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex_bad_back(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_repeat_complex_bad_back(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[A]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+
+attributes #0 = { "target-features"="+sve" }