diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -59,6 +59,8 @@
   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateVectorExtract(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateVectorInsert(SmallSetVector<Function *, 4> &Functions);
 
   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
@@ -276,11 +278,178 @@
   return Changed;
 }
 
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we
+// introduce scalable stores as late as possible.
+bool SVEIntrinsicOpts::optimizePredicateVectorExtract(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+    if (!Attr.isValid())
+      continue;
+
+    unsigned MinSVEVectorSize, MaxSVEVectorSize;
+    std::tie(MinSVEVectorSize, MaxSVEVectorSize) = Attr.getVScaleRangeArgs();
+    if (MinSVEVectorSize != MaxSVEVectorSize)
+      continue;
+
+    auto *PredType =
+        ScalableVectorType::get(Type::getInt1Ty(F->getContext()), 16);
+
+    for (auto &BB : *F) {
+      SmallSetVector<StoreInst *, 4> InterestingStores;
+
+      for (Instruction &I : BB) {
+        // If we have a store..
+        auto *Store = dyn_cast<StoreInst>(&I);
+        if (!Store)
+          continue;
+
+        // ..that is storing a predicate vector sized worth of bits..
+        auto *FixedStoreType =
+            dyn_cast<FixedVectorType>(Store->getOperand(0)->getType());
+        if (!FixedStoreType ||
+            FixedStoreType->getPrimitiveSizeInBits() != 16 * MinSVEVectorSize)
+          continue;
+
+        // ..where the value stored comes from a vector extract with one use..
+        auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
+        if (!IntrI ||
+            IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract ||
+            !IntrI->hasOneUse())
+          continue;
+
+        // ..that is extracting from index 0..
+        auto Idx = cast<ConstantInt>(IntrI->getOperand(1))->getZExtValue();
+        if (Idx != 0)
+          continue;
+
+        // ..where the value being extracted from comes from a bitcast..
+        auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
+        if (!BitCast)
+          continue;
+
+        // ..and the bitcast is casting from predicate type..
+        if (BitCast->getOperand(0)->getType() != PredType)
+          continue;
+
+        // ..convert it into a direct predicate store.
+        InterestingStores.insert(Store);
+      }
+
+      for (auto *Store : InterestingStores) {
+        IRBuilder<> Builder(F->getContext());
+        Builder.SetInsertPoint(&BB, Store->getIterator());
+
+        auto *IntrI = cast<IntrinsicInst>(Store->getOperand(0));
+        auto *BitCast = cast<BitCastInst>(IntrI->getOperand(0));
+        auto *PtrBitCast = Builder.CreateBitCast(
+            Store->getPointerOperand(),
+            PredType->getPointerTo(Store->getPointerAddressSpace()));
+        auto *StorePred =
+            Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);
+
+        Store->replaceAllUsesWith(StorePred);
+        Store->eraseFromParent();
+        IntrI->eraseFromParent();
+        BitCast->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we
+// introduce scalable loads as late as possible.
+bool SVEIntrinsicOpts::optimizePredicateVectorInsert(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+    if (!Attr.isValid())
+      continue;
+
+    unsigned MinSVEVectorSize, MaxSVEVectorSize;
+    std::tie(MinSVEVectorSize, MaxSVEVectorSize) = Attr.getVScaleRangeArgs();
+    if (MinSVEVectorSize != MaxSVEVectorSize)
+      continue;
+
+    auto *PredType =
+        ScalableVectorType::get(Type::getInt1Ty(F->getContext()), 16);
+
+    for (auto &BB : *F) {
+      SmallSetVector<BitCastInst *, 4> InterestingBitCasts;
+
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        // If we have a bitcast..
+        auto *BitCast = dyn_cast<BitCastInst>(&I);
+        if (!BitCast || BitCast->getType() != PredType)
+          continue;
+
+        // ..whose operand is a vector_insert with only one use..
+        auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+        if (!IntrI ||
+            IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert ||
+            !IntrI->hasOneUse())
+          continue;
+
+        // ..that is inserting into index zero of an undef vector..
+        auto Idx = cast<ConstantInt>(IntrI->getOperand(2))->getZExtValue();
+        if (!isa<UndefValue>(IntrI->getOperand(0)) || Idx != 0)
+          continue;
+
+        // ..where the value inserted comes from a load with only one use..
+        auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+        if (!Load || !Load->hasOneUse())
+          continue;
+
+        // ..that is loading a predicate vector sized worth of bits..
+        auto *FixedLoadType = dyn_cast<FixedVectorType>(Load->getType());
+        if (!FixedLoadType ||
+            FixedLoadType->getPrimitiveSizeInBits() != 16 * MinSVEVectorSize)
+          continue;
+
+        // ..convert it into a direct predicate load.
+        InterestingBitCasts.insert(BitCast);
+      }
+
+      for (auto *BitCast : InterestingBitCasts) {
+        IRBuilder<> Builder(F->getContext());
+        Builder.SetInsertPoint(&BB, BitCast->getIterator());
+
+        auto *IntrI = cast<IntrinsicInst>(BitCast->getOperand(0));
+        auto *Load = cast<LoadInst>(IntrI->getOperand(1));
+        auto *PtrBitCast = Builder.CreateBitCast(
+            Load->getPointerOperand(),
+            PredType->getPointerTo(Load->getPointerAddressSpace()));
+        auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+        BitCast->replaceAllUsesWith(LoadPred);
+        BitCast->eraseFromParent();
+        IntrI->eraseFromParent();
+        Load->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
 
   Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizePredicateVectorExtract(Functions);
+  Changed |= optimizePredicateVectorInsert(Functions);
 
   return Changed;
 }
@@ -297,6 +466,8 @@
       continue;
 
     switch (F.getIntrinsicID()) {
+    case Intrinsic::experimental_vector_extract:
+    case Intrinsic::experimental_vector_insert:
     case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll
@@ -0,0 +1,86 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @pred_store_v2i8(<vscale x 16 x i1> %pred, <2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_store_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <2 x i8> %extract, <2 x i8>* %addr, align 4
+  ret void
+}
+
+define void @pred_store_v4i8(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_store_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+define void @pred_store_v8i8(<vscale x 16 x i1> %pred, <8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_store_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <8 x i8> %extract, <8 x i8>* %addr, align 4
+  ret void
+}
+
+
+; Check that too small a vscale prevents the optimization.
+define void @pred_store_neg1(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_store_neg1(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that too large a vscale prevents the optimization.
+define void @pred_store_neg2(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_store_neg2(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that a non-zero index prevents the optimization.
+define void @pred_store_neg3(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_store_neg3(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 4)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that differing vscale min/max prevents the optimization.
+define void @pred_store_neg4(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #3 {
+; CHECK-LABEL: @pred_store_neg4(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+declare <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8>, i64)
+declare <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8>, i64)
+declare <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
@@ -0,0 +1,96 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <2 x i8>, <2 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+
+define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <8 x i8>, <8 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that too small a vscale prevents the optimization.
+define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_neg1(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that too large a vscale prevents the optimization.
+define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_neg2(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a non-zero index prevents the optimization.
+define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_neg3(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that differing vscale min/max prevents the optimization.
+define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr) #3 {
+; CHECK-LABEL: @pred_load_neg4(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that insertion into a non-undef vector prevents the optimization.
+define <vscale x 16 x i1> @pred_load_neg5(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
+; CHECK-LABEL: @pred_load_neg5(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }