diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5829,6 +5829,7 @@
   let mayLoad = load;
   let mayStore = !eq(load,0);
   let hasSideEffects = 0;
+  let validForTailPredication = load;
 }
 
 // A parameter class used to encapsulate all the ways the writeback
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -782,7 +782,7 @@
   // the false lanes are zeroed and here we're trying to track that those false
   // lanes remain zero, or where they change, the differences are masked away
   // by their user(s).
-  // All MVE loads and stores have to be predicated, so we know that any load
+  // All MVE stores have to be predicated, so we know that any predicated load
   // operands, or stored results are equivalent already. Other explicitly
   // predicated instructions will perform the same operation in the original
   // loop and the tail-predicated form too. Because of this, we can insert
@@ -1038,8 +1038,8 @@
   }
 
   // If the instruction is already explicitly predicated, then the conversion
-  // will be fine, but ensure that all memory operations are predicated.
-  return !IsUse && MI->mayLoadOrStore() ? false : true;
+  // will be fine, but ensure that all store operations are predicated.
+  return !IsUse && MI->mayStore() ? false : true;
 }
 
 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
@@ -6,26 +6,17 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    subs.w r12, r2, #8
-; CHECK-NEXT:    mov.w r3, #-1
-; CHECK-NEXT:    csinv r3, r3, r12, pl
-; CHECK-NEXT:    add.w r12, r3, r2
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB0_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    subs r2, #8
 ; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vmulht.s16 q2, q1, q1
-; CHECK-NEXT:    vmulht.s16 q0, q0, q0
-; CHECK-NEXT:    vqaddt.s16 q0, q0, q2
-; CHECK-NEXT:    vshrt.s16 q0, q0, #1
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vstrht.16 q0, [r1], #16
-; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:    vmulh.s16 q2, q1, q1
+; CHECK-NEXT:    vmulh.s16 q0, q0, q0
+; CHECK-NEXT:    vqadd.s16 q0, q0, q2
+; CHECK-NEXT:    vshr.s16 q0, q0, #1
+; CHECK-NEXT:    vstrh.16 q0, [r1], #16
+; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %do.end
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -148,25 +139,14 @@
 ; CHECK-LABEL: good2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    cmp r2, #4
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, #4
-; CHECK-NEXT:    subs r3, r2, r3
-; CHECK-NEXT:    add.w r12, r3, #3
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB3_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmlavat.s32 r12, q1, q0
-; CHECK-NEXT:    le lr, .LBB3_1
+; CHECK-NEXT:    vmlava.s32 r12, q1, q0
+; CHECK-NEXT:    letp lr, .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %do.end
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r7, pc}
diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
--- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp
+++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
@@ -382,7 +382,7 @@
       return false;
     case MVE_ASRLi:
     case MVE_ASRLr:
-    case MVE_LSRL:	
+    case MVE_LSRL:
     case MVE_SQRSHR:
     case MVE_SQSHL:
     case MVE_SRSHR:
@@ -393,7 +393,7 @@
     case MVE_VABDf32:
     case MVE_VABDs16:
     case MVE_VABDs32:
-    case MVE_VABDs8:	
+    case MVE_VABDs8:
     case MVE_VABDu16:
     case MVE_VABDu32:
     case MVE_VABDu8:
@@ -609,6 +609,42 @@
     case MVE_VIWDUPu16:
     case MVE_VIWDUPu32:
     case MVE_VIWDUPu8:
+    case MVE_VLD20_8:
+    case MVE_VLD21_8:
+    case MVE_VLD20_16:
+    case MVE_VLD21_16:
+    case MVE_VLD20_32:
+    case MVE_VLD21_32:
+    case MVE_VLD20_8_wb:
+    case MVE_VLD21_8_wb:
+    case MVE_VLD20_16_wb:
+    case MVE_VLD21_16_wb:
+    case MVE_VLD20_32_wb:
+    case MVE_VLD21_32_wb:
+    case MVE_VLD40_8:
+    case MVE_VLD41_8:
+    case MVE_VLD42_8:
+    case MVE_VLD43_8:
+    case MVE_VLD40_16:
+    case MVE_VLD41_16:
+    case MVE_VLD42_16:
+    case MVE_VLD43_16:
+    case MVE_VLD40_32:
+    case MVE_VLD41_32:
+    case MVE_VLD42_32:
+    case MVE_VLD43_32:
+    case MVE_VLD40_8_wb:
+    case MVE_VLD41_8_wb:
+    case MVE_VLD42_8_wb:
+    case MVE_VLD43_8_wb:
+    case MVE_VLD40_16_wb:
+    case MVE_VLD41_16_wb:
+    case MVE_VLD42_16_wb:
+    case MVE_VLD43_16_wb:
+    case MVE_VLD40_32_wb:
+    case MVE_VLD41_32_wb:
+    case MVE_VLD42_32_wb:
+    case MVE_VLD43_32_wb:
     case MVE_VLDRBS16:
     case MVE_VLDRBS16_post:
     case MVE_VLDRBS16_pre:
@@ -657,9 +693,9 @@
     case MVE_VLDRWU32_rq_u:
     case MVE_VMOVimmf32:
     case MVE_VMOVimmi16:
-    case MVE_VMOVimmi32:	
+    case MVE_VMOVimmi32:
     case MVE_VMOVimmi64:
-    case MVE_VMOVimmi8:	
+    case MVE_VMOVimmi8:
     case MVE_VMOVNi16bh:
     case MVE_VMOVNi16th:
     case MVE_VMOVNi32bh:
@@ -679,7 +715,7 @@
     case MVE_VMULLTs8:
     case MVE_VMULLTu16:
     case MVE_VMULLTu32:
-    case MVE_VMULLTu8:	
+    case MVE_VMULLTu8:
     case MVE_VMUL_qr_f16:
     case MVE_VMUL_qr_f32:
     case MVE_VMUL_qr_i16:
@@ -702,7 +738,7 @@
     case MVE_VORR:
     case MVE_VORRimmi16:
     case MVE_VORRimmi32:
-    case MVE_VPST:	
+    case MVE_VPST:
     case MVE_VQABSs16:
     case MVE_VQABSs32:
     case MVE_VQABSs8:
@@ -814,7 +850,7 @@
     case MVE_VRHADDs32:
     case MVE_VRHADDs8:
     case MVE_VRHADDu16:
-    case MVE_VRHADDu32:	
+    case MVE_VRHADDu32:
     case MVE_VRHADDu8:
     case MVE_VRINTf16A:
     case MVE_VRINTf16M:
@@ -825,12 +861,12 @@
     case MVE_VRINTf32A:
     case MVE_VRINTf32M:
     case MVE_VRINTf32N:
-    case MVE_VRINTf32P:	
-    case MVE_VRINTf32X:	
+    case MVE_VRINTf32P:
+    case MVE_VRINTf32X:
     case MVE_VRINTf32Z:
     case MVE_VRSHL_by_vecs16:
     case MVE_VRSHL_by_vecs32:
-    case MVE_VRSHL_by_vecs8:	
+    case MVE_VRSHL_by_vecs8:
     case MVE_VRSHL_by_vecu16:
     case MVE_VRSHL_by_vecu32:
     case MVE_VRSHL_by_vecu8:
@@ -887,7 +923,7 @@
     case MVE_VSTRB16_rq:
     case MVE_VSTRB32:
     case MVE_VSTRB32_post:
-    case MVE_VSTRB32_pre:	
+    case MVE_VSTRB32_pre:
     case MVE_VSTRB32_rq:
     case MVE_VSTRB8_rq:
     case MVE_VSTRBU8:
@@ -957,7 +993,9 @@
     for (auto &Op : Desc.operands()) {
       // Only check instructions that access the MQPR regs.
       if ((Op.OperandType & MCOI::OPERAND_REGISTER) == 0 ||
-          Op.RegClass != ARM::MQPRRegClassID)
+          (Op.RegClass != ARM::MQPRRegClassID &&
+           Op.RegClass != ARM::QQPRRegClassID &&
+           Op.RegClass != ARM::QQQQPRRegClassID))
         continue;
 
       uint64_t Flags = MII->get(i).TSFlags;