diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -22,9 +22,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
+#include "AArch64Subtarget.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -59,10 +61,13 @@
   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateVectorInsert(SmallSetVector<Function *, 4> &Functions);
 
   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
+
+  const TargetMachine *TM;
 };
 } // end anonymous namespace
 
@@ -276,11 +281,84 @@
   return Changed;
 }
 
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we
+// introduce scalable loads as late as possible.
+bool SVEIntrinsicOpts::optimizePredicateVectorInsert(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    auto &ST = TM->getSubtarget<AArch64Subtarget>(*F);
+    if (!ST.hasSVE())
+      continue;
+    unsigned MinSVEVectorSize = ST.getMinSVEVectorSizeInBits() / 128;
+    if (MinSVEVectorSize == 0)
+      continue;
+
+    auto *PredType =
+        ScalableVectorType::get(Type::getInt1Ty(F->getContext()), 16);
+
+    for (auto &BB : *F) {
+      SmallSetVector<BitCastInst *, 4> InterestingBitCasts;
+
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        auto *BitCast = dyn_cast<BitCastInst>(&I);
+        if (!BitCast || BitCast->getType() != PredType)
+          continue;
+
+        auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+        if (!IntrI ||
+            IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert ||
+            !IntrI->hasOneUse())
+          continue;
+
+        auto Idx = cast<ConstantInt>(IntrI->getOperand(2))->getZExtValue();
+        if (!isa<UndefValue>(IntrI->getOperand(0)) || Idx != 0)
+          continue;
+
+        auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+        if (!Load || !Load->hasOneUse())
+          continue;
+
+        auto *FixedLoadType = dyn_cast<FixedVectorType>(Load->getType());
+        if (!FixedLoadType ||
+            FixedLoadType->getPrimitiveSizeInBits() != 16 * MinSVEVectorSize)
+          continue;
+
+        InterestingBitCasts.insert(BitCast);
+      }
+
+      for (auto *BitCast : InterestingBitCasts) {
+        IRBuilder<> Builder(F->getContext());
+        Builder.SetInsertPoint(&BB, BitCast->getIterator());
+
+        auto *IntrI = cast<IntrinsicInst>(BitCast->getOperand(0));
+        auto *Load = cast<LoadInst>(IntrI->getOperand(1));
+        auto *PtrBitCast = Builder.CreateBitCast(
+            Load->getPointerOperand(),
+            PredType->getPointerTo(Load->getPointerAddressSpace()));
+        auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+        BitCast->replaceAllUsesWith(LoadPred);
+        BitCast->eraseFromParent();
+        IntrI->eraseFromParent();
+        Load->eraseFromParent();
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
 
   Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizePredicateVectorInsert(Functions);
 
   return Changed;
 }
@@ -289,6 +367,10 @@
   bool Changed = false;
   SmallSetVector<Function *, 4> Functions;
 
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  assert(TPC && "Expected a TargetPassConfig");
+  TM = &TPC->getTM<TargetMachine>();
+
   // Check for SVE intrinsic declarations first so that we only iterate over
   // relevant functions. Where an appropriate declaration is found, store the
   // function(s) where it is used so we can target these only.
@@ -297,6 +379,7 @@
       continue;
 
     switch (F.getIntrinsicID()) {
+    case Intrinsic::experimental_vector_insert:
     case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
@@ -0,0 +1,85 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <2 x i8>, <2 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+
+define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <8 x i8>, <8 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a too-small vscale prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_neg1(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a too-large vscale prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_neg2(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a non-zero index prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_neg3(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that insertion into a non-undef vector prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
+; CHECK-LABEL: @pred_load_neg4(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,8) }
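
For reference, a minimal before/after sketch of the rewrite performed by optimizePredicateVectorInsert, based on the vscale_range(2,2) case exercised by @pred_load_v4i8 above (value names are illustrative, not actual pass output):

Before:

  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret

After:

  %1 = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
  %2 = load <vscale x 16 x i1>, <vscale x 16 x i1>* %1
  ret <vscale x 16 x i1> %2

The fold only fires when the minimum SVE vector length is known from vscale_range and the fixed-width load covers exactly one predicate register (16 * minimum vscale bits, i.e. 32 bits here), which is what the getMinSVEVectorSizeInBits check in the pass enforces.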