diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2766,6 +2766,11 @@
     return false;
   }
 
+  virtual bool isInsertSubvectorLegal(EVT ResVT, EVT SrcVT,
+                                      unsigned Index) const {
+    return true;
+  }
+
   /// Try to convert an extract element of a vector binary operation into an
   /// extract element followed by a scalar operation.
   virtual bool shouldScalarizeBinop(SDValue VecOp) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18306,7 +18306,7 @@
 
 /// Convert a disguised subvector insertion into a shuffle:
 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
   assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
-         "Expected extract_vector_elt");
+         "Expected insert_vector_elt");
   SDValue InsertVal = N->getOperand(1);
   SDValue Vec = N->getOperand(0);
@@ -18468,15 +18468,11 @@
     return SDValue();
   }
 
-  if (VT.isScalableVector())
-    return SDValue();
-
-  unsigned NumElts = VT.getVectorNumElements();
-
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
-  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
-    return Shuf;
+  if (VT.isFixedLengthVector())
+    if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+      return Shuf;
 
   // Canonicalize insert_vector_elt dag nodes.
   // Example:
@@ -18502,6 +18498,42 @@
   if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return SDValue();
 
+  unsigned NumElts = VT.getVectorMinNumElements();
+  bool ScalableOut = false;
+  EVT FixedVT = VT;
+  if (VT.isScalableVector()) {
+    // BUILD_VECTOR does not currently support scalable vectors. Insert into
+    // BUILD_VECTOR through an INSERT_SUBVECTOR. The motivation for this is to
+    // allow conversions from a fixed-length vector to a scalable vector to
+    // become a no-op where the fixed-length vector originates from a
+    // subregister of the scalable register.
+    FixedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), NumElts);
+
+    // Can't make an insert_subvector of this.
+    if (!TLI.isInsertSubvectorLegal(VT, FixedVT, 0) ||
+        !TLI.isTypeLegal(FixedVT) || !TLI.isTypeLegal(VT) ||
+        !TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT))
+      return SDValue();
+
+    if (InVec.isUndef()) {
+      // InVec was undef.
+      InVec = DAG.getUNDEF(FixedVT);
+    } else if (InVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+               InVec.getOperand(0).isUndef() &&
+               InVec.getOperand(1).getOpcode() == ISD::BUILD_VECTOR &&
+               InVec.getOperand(1).hasOneUse() &&
+               InVec.getConstantOperandVal(2) == 0 &&
+               Elt < InVec.getOperand(1)
+                         .getValueType()
+                         .getVectorMinNumElements()) {
+      // InVec was (insert_subvector undef (build_vector {...}) 0).
+      InVec = InVec.getOperand(1);
+      // InVec is now (build_vector {...}).
+    }
+    // Below, the insertion is treated as if it targets a FixedVT BUILD_VECTOR.
+    ScalableOut = true;
+  }
+
   // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   // vector elements.
@@ -18526,8 +18558,16 @@
     Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
   }
 
-  // Return the new vector
-  return DAG.getBuildVector(VT, DL, Ops);
+  SDValue Ret = DAG.getBuildVector(FixedVT, DL, Ops);
+  if (ScalableOut) {
+    // There is no scalable build_vector, so use
+    // (insert_subvector undef (build_vector {...}) 0) instead.
+    SDValue Zero =
+        DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    Ret =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Ret, Zero);
+  }
+  return Ret;
 }
 
 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -639,6 +639,17 @@
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
 
+  bool isInsertSubvectorLegal(EVT ResVT, EVT SrcVT,
+                              unsigned Index) const override {
+    if (ResVT.isScalableVector() && SrcVT.isFixedLengthVector()) {
+      // A fixed-length insert into a scalable vector is only legal if the
+      // scalable result occupies a full vector granule.
+      return ResVT.getSizeInBits().getKnownMinSize() ==
+             AArch64::SVEBitsPerBlock;
+    }
+    return true;
+  }
+
   bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
                             bool MathUsed) const override {
     // Using overflow ops for overflow checks only should beneficial on
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -418,26 +418,51 @@
 
 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                  IntrinsicInst &II) {
-  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
-  if (!Pg)
-    return None;
+  assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
+         "Expected SVE DUP!");
 
-  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
-    return None;
+  auto *Ty = cast<ScalableVectorType>(II.getType());
+  assert(Ty && "non-scalable result of scalable intrinsic");
 
-  const auto PTruePattern =
-      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
-  if (PTruePattern != AArch64SVEPredPattern::vl1)
+  // Only consider dup producing <vscale x 2 x double>.
+  if (!Ty || !Ty->getScalarType()->isDoubleTy() || Ty->getMinNumElements() != 2)
     return None;
 
-  // The intrinsic is inserting into lane zero so use an insert instead.
+  LLVMContext &Ctx = II.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&II);
+
   auto *IdxTy = Type::getInt64Ty(II.getContext());
-  auto *Insert = InsertElementInst::Create(
-      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
-  Insert->insertBefore(&II);
-  Insert->takeName(&II);
-
-  return IC.replaceInstUsesWith(II, Insert);
+  auto m_PTrue = [](auto Pat) {
+    return m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(Pat);
+  };
+  auto m_Dup = [](auto PassThru, auto Pg, auto Val) {
+    return m_Intrinsic<Intrinsic::aarch64_sve_dup>(PassThru, Pg, Val);
+  };
+
+  auto VL2 = m_PTrue(m_SpecificInt(AArch64SVEPredPattern::vl2));
+  auto VL1 = m_PTrue(m_SpecificInt(AArch64SVEPredPattern::vl1));
+  Value *Out = nullptr, *PassThru, *Elem0, *Elem1;
+  if (match(&II, m_Dup(m_Value(PassThru), VL1, m_Value(Elem0)))) {
+    // (dup vec VL1 elem0) => (insertelement vec elem0 0)
+    Out = PassThru;
+    Out = Builder.CreateInsertElement(PassThru, Elem0,
+                                      ConstantInt::get(IdxTy, 0));
+  } else if (match(&II, m_Dup(m_Dup(m_Value(PassThru), VL2, m_Value(Elem1)),
+                              VL1, m_Value(Elem0)))) {
+    // (dup (dup vec VL2 elem1) VL1 elem0) =>
+    //   (insertelement (insertelement vec elem1 1) elem0 0)
+    Out = PassThru;
+    Out = Builder.CreateInsertElement(PassThru, Elem1,
+                                      ConstantInt::get(IdxTy, 1));
+    Out = Builder.CreateInsertElement(Out, Elem0,
+                                      ConstantInt::get(IdxTy, 0));
+  }
+
+  if (!Out)
+    return None;
+  Out->takeName(&II);
+  return IC.replaceInstUsesWith(II, Out);
 }
 
 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-insert-elt.ll b/llvm/test/CodeGen/AArch64/dag-combine-insert-elt.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dag-combine-insert-elt.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+attributes #0 = {"target-features"="+sve"}
+
+define <vscale x 2 x double> @two_inserts(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: two_inserts:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.2d, #1.00000000
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 2 x double> undef, double 0.0, i32 0
+  %a = insertelement <vscale x 2 x double> undef, double 1.0, i32 1
+  ret <vscale x 2 x double> %a
+}
+
+
+define <vscale x 2 x double> @neon_to_sve(<2 x double> %in) #0 {
+; CHECK-LABEL: neon_to_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ret
+  %e0 = extractelement <2 x double> %in, i32 0
+  %e1 = extractelement <2 x double> %in, i32 1
+  %b = insertelement <vscale x 2 x double> undef, double %e0, i32 0
+  %a = insertelement <vscale x 2 x double> undef, double %e1, i32 1
+  ret <vscale x 2 x double> %a
+}
+
+; Function Attrs: nofree norecurse nosync nounwind readnone uwtable willreturn mustprogress vscale_range(4,4)
+define dso_local <vscale x 4 x float> @float32x4_t_to_svfloat32_t(<4 x float> %nv) local_unnamed_addr #0 {
+; CHECK-LABEL: float32x4_t_to_svfloat32_t:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ret
+entry:
+  %lane0_floats = shufflevector <4 x float> %nv, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %lane1_floats = shufflevector <4 x float> %nv, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+
+  %0 = bitcast <2 x float> %lane0_floats to <1 x double>
+  %1 = bitcast <2 x float> %lane1_floats to <1 x double>
+
+  %lane0 = extractelement <1 x double> %0, i32 0
+  %lane1 = extractelement <1 x double> %1, i32 0
+
+  %b = insertelement <vscale x 2 x double> undef, double %lane1, i32 1
+  %a = insertelement <vscale x 2 x double> %b, double %lane0, i32 0
+
+  %out = bitcast <vscale x 2 x double> %a to <vscale x 4 x float>
+  ret <vscale x 4 x float> %out
+}
+
+declare <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64 immarg)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -12,6 +12,7 @@
   ret <vscale x 16 x i8> %b
 }
 
+
 define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: test_lane0_8xi16:
 ; CHECK:       // %bb.0:
@@ -56,6 +57,58 @@
   ret <vscale x 2 x double> %b
 }
 
+define <vscale x 2 x double> @test_lane01_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane01_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov x9, #4636737291354636288
+; CHECK-NEXT:    mov x10, #70368744177664
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.d, vl1
+; CHECK-NEXT:    movk x10, #16473, lsl #48
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z0.d, p1/m, z3.d
+; CHECK-NEXT:    fmov d1, x10
+; CHECK-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 2 x double> %a, double 101.0, i32 1
+  %c = insertelement <vscale x 2 x double> %b, double 100.0, i32 0
+  ret <vscale x 2 x double> %c
+}
+
+define <vscale x 2 x double> @test_lane012_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane012_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #4636737291354636288
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    ptrue p1.d, vl1
+; CHECK-NEXT:    index z2.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov x8, #70368744177664
+; CHECK-NEXT:    movk x8, #16473, lsl #48
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov x8, #140737488355328
+; CHECK-NEXT:    movk x8, #16473, lsl #48
+; CHECK-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov z0.d, p1/m, d1
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 2 x double> %a, double 102.0, i32 2
+  %c = insertelement <vscale x 2 x double> %b, double 101.0, i32 1
+  %d = insertelement <vscale x 2 x double> %c, double 100.0, i32 0
+  ret <vscale x 2 x double> %d
+}
+
 define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: test_lane0_4xf32:
 ; CHECK:       // %bb.0:
@@ -155,12 +208,7 @@
 define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
 ; CHECK-LABEL: test_lane6_undef_8xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
-; CHECK-NEXT:    index z0.h, #0, #1
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, w0
+; CHECK-NEXT:    dup v0.8h, w0
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
   ret <vscale x 8 x i16> %b
diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -8,10 +8,9 @@
 define <vscale x 4 x i32> @test_post_ld1_insert(i32* %a, i32** %ptr, i64 %inc) {
 ; CHECK-LABEL: test_post_ld1_insert:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    add x9, x0, x2, lsl #2
-; CHECK-NEXT:    str x9, [x1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    add x8, x0, x2, lsl #2
+; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    ret
   %load = load i32, i32* %a
   %ins = insertelement <vscale x 4 x i32> undef, i32 %load, i32 0
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
@@ -1,11 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0(
-; CHECK: %insert = insertelement <vscale x 16 x i8> %v, i8 %s, i64 0
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -13,9 +16,10 @@
 
 define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_1(
-; CHECK: %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
-; CHECK-NEXT: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -23,16 +27,19 @@
 
 define <vscale x 16 x i8> @dup_insertelement_x(<vscale x 16 x i8> %v, i8 %s, <vscale x 16 x i1> %pg) #0 {
 ; CHECK-LABEL: @dup_insertelement_x(
-; CHECK: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG:%.*]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
 }
 
 define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i16 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0_convert(
-; CHECK: %insert = insertelement <vscale x 8 x i16> %v, i16 %s, i64 0
-; CHECK-NEXT: ret <vscale x 8 x i16> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> [[V:%.*]], <vscale x 8 x i1> [[PG]], i16 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[INSERT]]
+;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg)
   %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
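
Illustrative sketch (not part of the patch; node spellings abbreviated): the new DAGCombiner path rewrites a chain of scalable INSERT_VECTOR_ELT nodes at constant indices into a fixed-length BUILD_VECTOR placed at index 0 of the scalable result, roughly

    insert_vector_elt (insert_vector_elt undef:nxv2f64, f64 %x, 0), f64 %y, 1
      -->
    insert_subvector undef:nxv2f64, (build_vector f64 %x, f64 %y):v2f64, 0

On AArch64 the v2f64 BUILD_VECTOR materialises in a NEON q register, which aliases the low 128 bits of the corresponding SVE z register, so the insert_subvector at index 0 lowers to a no-op subregister copy; that is what the `// kill: def $q0 killed $q0 def $z0` lines in the new tests show. The AArch64 isInsertSubvectorLegal override limits this to the case where the scalable result's minimum size is exactly one 128-bit granule.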