diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -361,7 +361,8 @@
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
 
   bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override {
-    return MI->getOpcode() == ARM::t2LoopEndDec;
+    return MI->getOpcode() == ARM::t2LoopEndDec ||
+           MI->getOpcode() == ARM::t2DoLoopStartTP;
   }
 
 private:
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5427,6 +5427,7 @@
   t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
   [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
 
+let isTerminator = 1, hasSideEffects = 1 in
 def t2DoLoopStartTP :
   t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>;
 
diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
--- a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -341,13 +341,10 @@
   for (MachineInstr &Use :
        MRI->use_instructions(LoopStart->getOperand(0).getReg()))
     if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
-        !DT->dominates(ML->getHeader(), Use.getParent()))
-      InsertPt = &Use;
-  if (InsertPt != MBB->end() &&
-      !DT->dominates(MRI->getVRegDef(CountReg), &*InsertPt)) {
-    LLVM_DEBUG(dbgs() << "  InsertPt does not dominate CountReg!\n");
-    return false;
-  }
+        !DT->dominates(ML->getHeader(), Use.getParent())) {
+      LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
+      return false;
+    }
 
   MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
                                    TII->get(ARM::t2DoLoopStartTP))
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
@@ -7,15 +7,15 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrd r12, r4, [r0]
-; CHECK-NEXT:    ldrd r2, r3, [r0, #8]
-; CHECK-NEXT:    rsb r12, r12, r4, lsl #1
-; CHECK-NEXT:    mov r4, r12
+; CHECK-NEXT:    ldrd r12, r2, [r0]
+; CHECK-NEXT:    ldrd r4, r3, [r0, #8]
+; CHECK-NEXT:    rsb r12, r12, r2, lsl #1
+; CHECK-NEXT:    mov r2, r12
 ; CHECK-NEXT:    dlstp.16 lr, r12
 ; CHECK-NEXT:  .LBB0_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r3], #16
-; CHECK-NEXT:    vstrh.16 q0, [r2], #16
+; CHECK-NEXT:    vstrh.16 q0, [r4], #16
 ; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %do.end
 ; CHECK-NEXT:    ldr r2, [r0]
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
@@ -8,8 +8,8 @@
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:  .LBB0_1: @ %do.body.i
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r12], #16
@@ -17,30 +17,19 @@
 ; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %arm_mean_f32_mve.exit
 ; CHECK-NEXT:    vmov s4, r1
-; CHECK-NEXT:    mov r3, r1
 ; CHECK-NEXT:    vadd.f32 s0, s3, s3
-; CHECK-NEXT:    cmp r1, #4
 ; CHECK-NEXT:    vcvt.f32.u32 s4, s4
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, #4
-; CHECK-NEXT:    subs r3, r1, r3
-; CHECK-NEXT:    mov.w lr, #1
-; CHECK-NEXT:    adds r3, #3
-; CHECK-NEXT:    add.w lr, lr, r3, lsr #2
 ; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vdiv.f32 s0, s0, s4
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:  .LBB0_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
-; CHECK-NEXT:    vsubt.f32 q1, q1, r12
-; CHECK-NEXT:    vfmat.f32 q0, q1, q1
-; CHECK-NEXT:    le lr, .LBB0_3
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vsub.f32 q1, q1, r12
+; CHECK-NEXT:    vfma.f32 q0, q1, q1
+; CHECK-NEXT:    letp lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %do.end
 ; CHECK-NEXT:    subs r0, r1, #1
 ; CHECK-NEXT:    vadd.f32 s0, s3, s3
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -69,7 +69,7 @@
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
@@ -78,17 +78,17 @@
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vadd.i16 q1, q0, q1
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
-; CHECK-NEXT:    vadd.i16 q0, q0, q2
+; CHECK-NEXT:    vadd.i16 q1, q1, q2
 ; CHECK-NEXT:    le lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@@ -142,7 +142,7 @@
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #15
 ; CHECK-NEXT:    sub.w r12, r3, #16
 ; CHECK-NEXT:    movs r3, #1
@@ -151,16 +151,16 @@
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.8 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q0, [r1], #16
+; CHECK-NEXT:    vldrbt.u8 q1, [r1], #16
 ; CHECK-NEXT:    vldrbt.u8 q2, [r0], #16
 ; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    vsub.i8 q0, q2, q0
-; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vsub.i8 q1, q2, q1
+; CHECK-NEXT:    vadd.i8 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB2_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    uxtb r0, r0
@@ -212,7 +212,7 @@
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
@@ -221,16 +221,16 @@
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vsub.i16 q0, q2, q0
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vsub.i16 q1, q2, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB3_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@@ -284,7 +284,7 @@
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #15
 ; CHECK-NEXT:    sub.w r12, r3, #16
 ; CHECK-NEXT:    movs r3, #1
@@ -293,16 +293,16 @@
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.8 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
+; CHECK-NEXT:    vldrbt.u8 q1, [r0], #16
 ; CHECK-NEXT:    vldrbt.u8 q2, [r1], #16
 ; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    vmul.i8 q0, q2, q0
-; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vmul.i8 q1, q2, q1
+; CHECK-NEXT:    vadd.i8 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB4_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    uxtb r0, r0
@@ -354,7 +354,7 @@
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
@@ -363,16 +363,16 @@
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vmul.i16 q0, q2, q0
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vmul.i16 q1, q2, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@@ -423,7 +423,7 @@
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    subs r6, r3, #4
@@ -435,16 +435,16 @@
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u32 q0, [r4], #4
+; CHECK-NEXT:    vldrbt.u32 q1, [r4], #4
 ; CHECK-NEXT:    vldrbt.u32 q2, [r5], #4
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB6_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r12, q0
 ; CHECK-NEXT:    cbz r2, .LBB6_7
 ; CHECK-NEXT:  @ %bb.4: @ %vector.ph47
@@ -550,32 +550,32 @@
 ; CHECK-NEXT:    cbz r2, .LBB7_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    subs r3, #8
-; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    add.w lr, r4, r3, lsr #3
 ; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    mov r4, r1
+; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q0, [r3], #8
+; CHECK-NEXT:    vldrbt.u16 q1, [r3], #8
 ; CHECK-NEXT:    vldrbt.u16 q4, [r4], #8
 ; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vsub.i16 q3, q4, q0
-; CHECK-NEXT:    vmul.i16 q0, q4, q0
+; CHECK-NEXT:    vsub.i16 q3, q4, q1
+; CHECK-NEXT:    vmul.i16 q1, q4, q1
 ; CHECK-NEXT:    subs r2, #8
 ; CHECK-NEXT:    vadd.i16 q3, q3, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB7_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u16 r4, q2
 ; CHECK-NEXT:    vaddv.u16 r2, q0
 ; CHECK-NEXT:    b .LBB7_5
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -53,11 +53,11 @@
 define void @nested(i32* nocapture readonly %x, i32* nocapture readnone %y, i32* nocapture %z, i32 %m, i32 %n) {
 ; CHECK-LABEL: nested:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    cbz r3, .LBB1_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r5, [sp, #24]
+; CHECK-NEXT:    ldr r5, [sp, #28]
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    b .LBB1_4
@@ -77,11 +77,11 @@
 ; CHECK-NEXT:    beq .LBB1_2
 ; CHECK-NEXT:  @ %bb.5: @ %do.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    bic r6, r7, #3
-; CHECK-NEXT:    dlstp.32 lr, r5
+; CHECK-NEXT:    bic r9, r7, #3
 ; CHECK-NEXT:    mov r7, r5
-; CHECK-NEXT:    add.w r8, r0, r6, lsl #2
 ; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    add.w r8, r0, r9, lsl #2
+; CHECK-NEXT:    dlstp.32 lr, r5
 ; CHECK-NEXT:  .LBB1_6: @ %do.body
 ; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -90,11 +90,11 @@
 ; CHECK-NEXT:    letp lr, .LBB1_6
 ; CHECK-NEXT:  @ %bb.7: @ %if.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    subs r5, r5, r6
+; CHECK-NEXT:    sub.w r5, r5, r9
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    b .LBB1_3
 ; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %cmp20.not = icmp eq i32 %m, 0
   br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -266,8 +266,8 @@
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:    eor r12, r12, #-2147483648
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -530,8 +530,8 @@
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:    eor r12, r12, #-2147483648
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -9,11 +9,11 @@
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    adr r0, .LCPI0_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    adds r1, r3, #4
+; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
@@ -80,11 +80,11 @@
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
 ; CHECK-NEXT:    adr r0, .LCPI1_0
-; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    add.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vmov.i32 q0, #0x14
+; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -155,8 +155,8 @@
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    adr r0, .LCPI2_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vmov.i32 q2, #0x3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    adds r1, r3, #4
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -15,34 +15,34 @@
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    ldr r5, [r0, #8]
 ; CHECK-NEXT:    ldr r3, [r0]
-; CHECK-NEXT:    add.w r4, r3, r5, lsl #2
+; CHECK-NEXT:    add.w r3, r3, r5, lsl #2
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsl.w r9, r5, #2
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    mov r7, r3
+; CHECK-NEXT:    mov r4, r5
 ; CHECK-NEXT:    dlstp.32 lr, r5
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:    mov r6, r5
 ; CHECK-NEXT:  .LBB0_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrw.u32 q1, [r7], #16
-; CHECK-NEXT:    vldrw.u32 q2, [r3], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r6], #16
+; CHECK-NEXT:    vldrw.u32 q2, [r7], #16
 ; CHECK-NEXT:    vfma.f32 q0, q2, q1
 ; CHECK-NEXT:    letp lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s4, s2, s3
-; CHECK-NEXT:    add.w r3, r2, r0, lsl #2
+; CHECK-NEXT:    add.w r7, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    adds r0, #1
-; CHECK-NEXT:    add r4, r9
+; CHECK-NEXT:    add r3, r9
 ; CHECK-NEXT:    cmp r0, r12
 ; CHECK-NEXT:    vadd.f32 s0, s0, s4
-; CHECK-NEXT:    vstr s0, [r3]
+; CHECK-NEXT:    vstr s0, [r7]
 ; CHECK-NEXT:    bne .LBB0_2
 ; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
@@ -115,43 +115,35 @@
 ; CHECK-NEXT:    ldr.w r12, [r0, #8]
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    ldr r3, [r0]
-; CHECK-NEXT:    add.w r0, r12, #3
-; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r5, r3, r12, lsl #2
-; CHECK-NEXT:    subs r0, #4
+; CHECK-NEXT:    add.w r11, r3, r12, lsl #2
 ; CHECK-NEXT:    add.w r7, r3, r12, lsl #3
 ; CHECK-NEXT:    lsl.w r9, r12, #3
-; CHECK-NEXT:    add.w r8, r4, r0, lsr #2
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_3 Depth 2
-; CHECK-NEXT:    dls lr, r8
-; CHECK-NEXT:    ldr r6, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldr r5, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w r11, r4, #1
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    add.w r10, r4, #1
+; CHECK-NEXT:    mov r3, r11
 ; CHECK-NEXT:    mov r0, r7
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB1_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vctp.32 r10
-; CHECK-NEXT:    sub.w r10, r10, #4
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vldrwt.u32 q2, [r6], #16
-; CHECK-NEXT:    vldrwt.u32 q3, [r3], #16
-; CHECK-NEXT:    vfmat.f32 q1, q3, q2
-; CHECK-NEXT:    vldrwt.u32 q3, [r0], #16
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vfmat.f32 q0, q3, q2
-; CHECK-NEXT:    le lr, .LBB1_3
+; CHECK-NEXT:    vldrw.u32 q2, [r5], #16
+; CHECK-NEXT:    vldrw.u32 q3, [r3], #16
+; CHECK-NEXT:    vfma.f32 q1, q3, q2
+; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
+; CHECK-NEXT:    vfma.f32 q0, q3, q2
+; CHECK-NEXT:    letp lr, .LBB1_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s8, s2, s3
-; CHECK-NEXT:    add.w r0, r2, r11, lsl #2
+; CHECK-NEXT:    add.w r0, r2, r10, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    add r5, r9
+; CHECK-NEXT:    add r11, r9
 ; CHECK-NEXT:    vadd.f32 s2, s6, s7
 ; CHECK-NEXT:    add r7, r9
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
@@ -241,84 +233,86 @@
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
+; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    subs r1, #3
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo .LBB2_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r7, [r0, #8]
+; CHECK-NEXT:    ldr r3, [r0, #8]
 ; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    ldr r3, [r0]
-; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r0, r7, r7, lsl #1
-; CHECK-NEXT:    add.w r12, r3, r7, lsl #2
-; CHECK-NEXT:    add.w r1, r3, r7, lsl #3
-; CHECK-NEXT:    add.w r8, r3, r0, lsl #2
-; CHECK-NEXT:    adds r3, r7, #3
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
+; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
+; CHECK-NEXT:    adds r3, #3
 ; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    lsls r7, r0, #2
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    add.w r10, r1, r0, lsl #2
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    add.w r3, r5, r3, lsr #2
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT:    lsl.w r11, r0, #2
+; CHECK-NEXT:    add.w r1, r5, r3, lsr #2
+; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB2_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT:    ldrd r0, r10, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w r9, r5, #2
-; CHECK-NEXT:    add.w r11, r5, #1
-; CHECK-NEXT:    dls lr, r0
-; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r12
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    mov r4, r8
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    adds r0, r5, #2
+; CHECK-NEXT:    adds r2, r5, #1
+; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    mov r4, r10
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    mov r8, r7
+; CHECK-NEXT:    dlstp.32 lr, r7
 ; CHECK-NEXT:  .LBB2_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB2_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vctp.32 r10
-; CHECK-NEXT:    sub.w r10, r10, #4
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vldrwt.u32 q3, [r6], #16
-; CHECK-NEXT:    vldrwt.u32 q4, [r3], #16
-; CHECK-NEXT:    vfmat.f32 q1, q4, q3
-; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
-; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vfmat.f32 q2, q4, q3
-; CHECK-NEXT:    vldrwt.u32 q4, [r4], #16
-; CHECK-NEXT:    vfmat.f32 q0, q4, q3
-; CHECK-NEXT:    le lr, .LBB2_3
+; CHECK-NEXT:    vldrw.u32 q3, [r6], #16
+; CHECK-NEXT:    vldrw.u32 q4, [r3], #16
+; CHECK-NEXT:    vfma.f32 q1, q4, q3
+; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
+; CHECK-NEXT:    vfma.f32 q2, q4, q3
+; CHECK-NEXT:    vldrw.u32 q4, [r4], #16
+; CHECK-NEXT:    vfma.f32 q0, q4, q3
+; CHECK-NEXT:    letp lr, .LBB2_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s12, s10, s11
-; CHECK-NEXT:    add.w r0, r2, r11, lsl #2
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    add r12, r7
+; CHECK-NEXT:    add r9, r11
 ; CHECK-NEXT:    vadd.f32 s10, s6, s7
-; CHECK-NEXT:    add r1, r7
+; CHECK-NEXT:    add.w r0, r1, r2, lsl #2
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    add r8, r7
+; CHECK-NEXT:    add r12, r11
 ; CHECK-NEXT:    vadd.f32 s6, s2, s3
+; CHECK-NEXT:    add r10, r11
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    vadd.f32 s2, s8, s12
 ; CHECK-NEXT:    vadd.f32 s4, s4, s10
 ; CHECK-NEXT:    vadd.f32 s0, s0, s6
 ; CHECK-NEXT:    vstr s2, [r0]
-; CHECK-NEXT:    add.w r0, r2, r5, lsl #2
+; CHECK-NEXT:    add.w r0, r1, r5, lsl #2
 ; CHECK-NEXT:    adds r5, #3
 ; CHECK-NEXT:    vstr s4, [r0]
-; CHECK-NEXT:    add.w r0, r2, r9, lsl #2
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT:    vstr s0, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r5, r0
 ; CHECK-NEXT:    blo .LBB2_2
 ; CHECK-NEXT:  .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -410,81 +404,76 @@
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    .pad #40
+; CHECK-NEXT:    sub sp, #40
+; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
+; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo.w .LBB3_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr r2, [r0, #8]
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
-; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
-; CHECK-NEXT:    add.w r10, r1, r3, lsl #4
-; CHECK-NEXT:    add.w r9, r1, r0, lsl #2
-; CHECK-NEXT:    adds r0, r3, #3
+; CHECK-NEXT:    add.w r0, r2, r2, lsl #1
+; CHECK-NEXT:    add.w r12, r1, r2, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r2, lsl #3
+; CHECK-NEXT:    add.w r9, r1, r2, lsl #4
+; CHECK-NEXT:    add.w r11, r1, r0, lsl #2
+; CHECK-NEXT:    adds r0, r2, #3
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    lsls r7, r3, #4
 ; CHECK-NEXT:    subs r0, #4
 ; CHECK-NEXT:    add.w r0, r6, r0, lsr #2
-; CHECK-NEXT:    strd r0, r3, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT:    strd r0, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT:    lsls r0, r2, #4
+; CHECK-NEXT:    ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB3_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
 ; CHECK-NEXT:    adds r0, r6, #3
-; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r6, #2
-; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r6, #1
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    ldrd r0, r11, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    mov r5, r9
-; CHECK-NEXT:    dls lr, r0
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r4, r10
+; CHECK-NEXT:    str r0, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT:    adds r0, r6, #1
+; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r5, r11
+; CHECK-NEXT:    mov r4, r9
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    mov r10, r7
+; CHECK-NEXT:    dlstp.32 lr, r7
 ; CHECK-NEXT:  .LBB3_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vctp.32 r11
-; CHECK-NEXT:    sub.w r11, r11, #4
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vldrwt.u32 q4, [r1], #16
-; CHECK-NEXT:    vldrwt.u32 q5, [r0], #16
-; CHECK-NEXT:    vfmat.f32 q3, q5, q4
-; CHECK-NEXT:    vldrwt.u32 q5, [r3], #16
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vfmat.f32 q2, q5, q4
-; CHECK-NEXT:    vldrwt.u32 q5, [r5], #16
-; CHECK-NEXT:    vfmat.f32 q1, q5, q4
-; CHECK-NEXT:    vldrwt.u32 q5, [r4], #16
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vfmat.f32 q0, q5, q4
-; CHECK-NEXT:    le lr, .LBB3_3
+; CHECK-NEXT:    vldrw.u32 q4, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q5, [r0], #16
+; CHECK-NEXT:    vfma.f32 q3, q5, q4
+; CHECK-NEXT:    vldrw.u32 q5, [r3], #16
+; CHECK-NEXT:    vfma.f32 q2, q5, q4
+; CHECK-NEXT:    vldrw.u32 q5, [r5], #16
+; CHECK-NEXT:    vfma.f32 q1, q5, q4
+; CHECK-NEXT:    vldrw.u32 q5, [r4], #16
+; CHECK-NEXT:    vfma.f32 q0, q5, q4
+; CHECK-NEXT:    letp lr, .LBB3_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s16, s14, s15
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    add r8, r7
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s14, s10, s11
-; CHECK-NEXT:    add r12, r7
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
+; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s10, s6, s7
-; CHECK-NEXT:    add r9, r7
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    add r10, r7
 ; CHECK-NEXT:    vadd.f32 s6, s2, s3
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    vadd.f32 s2, s12, s16
@@ -492,20 +481,25 @@
 ; CHECK-NEXT:    vadd.f32 s4, s4, s10
 ; CHECK-NEXT:    vadd.f32 s0, s0, s6
 ; CHECK-NEXT:    vstr s2, [r0]
-; CHECK-NEXT:    add.w r0, r2, r6, lsl #2
+; CHECK-NEXT:    add.w r0, r1, r6, lsl #2
 ; CHECK-NEXT:    adds r6, #4
 ; CHECK-NEXT:    vstr s8, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
+; CHECK-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT:    vstr s4, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
+; CHECK-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT:    vstr s0, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    add r12, r0
+; CHECK-NEXT:    add r8, r0
+; CHECK-NEXT:    add r11, r0
+; CHECK-NEXT:    add r9, r0
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r6, r0
 ; CHECK-NEXT:    blo .LBB3_2
 ; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    add sp, #40
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -636,50 +630,46 @@
 ; CHECK-NEXT:  .LBB4_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB4_3 Depth 2
+; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    adds r1, r0, #4
+; CHECK-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    add.w r10, r0, #2
 ; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    add.w r11, r0, #1
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    ldrd r1, r11, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    add.w r10, r0, #2
-; CHECK-NEXT:    adds r7, r0, #1
-; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vmov q2, q1
 ; CHECK-NEXT:    vmov q4, q1
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    dlstp.32 lr, r7
 ; CHECK-NEXT:  .LBB4_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB4_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    add.w r9, r3, r5
-; CHECK-NEXT:    vctp.32 r11
-; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q5, [r1], #16
-; CHECK-NEXT:    vldrwt.u32 q6, [r3], #16
-; CHECK-NEXT:    vfmat.f32 q3, q6, q5
+; CHECK-NEXT:    vldrw.u32 q5, [r4], #16
+; CHECK-NEXT:    vldrw.u32 q6, [r3], #16
+; CHECK-NEXT:    vfma.f32 q3, q6, q5
 ; CHECK-NEXT:    add.w r12, r9, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q6, [r9]
-; CHECK-NEXT:    vfmat.f32 q4, q6, q5
-; CHECK-NEXT:    sub.w r11, r11, #4
-; CHECK-NEXT:    add.w r4, r12, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q6, [r12]
-; CHECK-NEXT:    vfmat.f32 q2, q6, q5
-; CHECK-NEXT:    adds r6, r4, r5
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vldrwt.u32 q6, [r4]
-; CHECK-NEXT:    vfmat.f32 q0, q6, q5
-; CHECK-NEXT:    vldrwt.u32 q6, [r6]
-; CHECK-NEXT:    vfmat.f32 q1, q6, q5
-; CHECK-NEXT:    le lr, .LBB4_3
+; CHECK-NEXT:    vldrw.u32 q6, [r9]
+; CHECK-NEXT:    vfma.f32 q4, q6, q5
+; CHECK-NEXT:    add.w r6, r12, r5
+; CHECK-NEXT:    vldrw.u32 q6, [r12]
+; CHECK-NEXT:    vfma.f32 q2, q6, q5
+; CHECK-NEXT:    adds r7, r6, r5
+; CHECK-NEXT:    vldrw.u32 q6, [r6]
+; CHECK-NEXT:    vfma.f32 q0, q6, q5
+; CHECK-NEXT:    vldrw.u32 q6, [r7]
+; CHECK-NEXT:    vfma.f32 q1, q6, q5
+; CHECK-NEXT:    letp lr, .LBB4_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB4_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s20, s18, s19
-; CHECK-NEXT:    add.w r1, r2, r7, lsl #2
+; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
 ; CHECK-NEXT:    vadd.f32 s16, s16, s17
 ; CHECK-NEXT:    vadd.f32 s18, s14, s15
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
@@ -844,7 +834,7 @@
 ; CHECK-NEXT:    adds r0, r3, #3
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsls r5, r3, #2
@@ -861,48 +851,43 @@
 ; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    ldrd r1, r8, [sp, #4] @ 8-byte Folded Reload
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r11, r0, #2
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    adds r4, r0, #1
-; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vmov q5, q1
 ; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    mov r9, r7
+; CHECK-NEXT:    dlstp.32 lr, r7
 ; CHECK-NEXT:  .LBB5_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    add.w r12, r3, r5
-; CHECK-NEXT:    vctp.32 r8
-; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q6, [r1], #16
-; CHECK-NEXT:    vldrwt.u32 q7, [r3], #16
-; CHECK-NEXT:    vfmat.f32 q4, q7, q6
+; CHECK-NEXT:    vldrw.u32 q6, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q7, [r3], #16
+; CHECK-NEXT:    vfma.f32 q4, q7, q6
 ; CHECK-NEXT:    add.w r10, r12, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q7, [r12]
-; CHECK-NEXT:    vfmat.f32 q5, q7, q6
+; CHECK-NEXT:    vldrw.u32 q7, [r12]
+; CHECK-NEXT:    vfma.f32 q5, q7, q6
 ; CHECK-NEXT:    add.w r6, r10, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q7, [r10]
-; CHECK-NEXT:    vfmat.f32 q2, q7, q6
-; CHECK-NEXT:    sub.w r8, r8, #4
+; CHECK-NEXT:    vldrw.u32 q7, [r10]
+; CHECK-NEXT:    vfma.f32 q2, q7, q6
 ; CHECK-NEXT:    adds r7, r6, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q7, [r6]
-; CHECK-NEXT:    vfmat.f32 q0, q7, q6
+; CHECK-NEXT:    vldrw.u32 q7, [r6]
+; CHECK-NEXT:    vfma.f32 q0, q7, q6
 ; CHECK-NEXT:    adds r6, r7, r5
-; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vldrwt.u32 q7, [r7]
-; CHECK-NEXT:    vfmat.f32 q3, q7, q6
-; CHECK-NEXT:    vldrwt.u32 q7, [r6]
-; CHECK-NEXT:    vfmat.f32 q1, q7, q6
-; CHECK-NEXT:    le lr, .LBB5_3
+; CHECK-NEXT:    vldrw.u32 q7, [r7]
+; CHECK-NEXT:    vfma.f32 q3, q7, q6
+; CHECK-NEXT:    vldrw.u32 q7, [r6]
+; CHECK-NEXT:    vfma.f32 q1, q7, q6
+; CHECK-NEXT:    letp lr, .LBB5_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s24, s22, s23
@@ -940,7 +925,7 @@
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]
 ; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT:    add r9, r1
+; CHECK-NEXT:    add r8, r1
 ; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB5_2
@@ -1090,7 +1075,7 @@
 ; CHECK-NEXT:    adds r0, r3, #3
 ; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsls r5, r3, #2
@@ -1109,27 +1094,29 @@
 ; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    ldr r7, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    ldrd r3, r1, [sp, #16] @ 8-byte Folded Reload
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    adds r4, r0, #2
+; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r8, r0, #1
-; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:    ldr.w r9, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    mov r3, r9
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vmov q5, q2
 ; CHECK-NEXT:    vmov q3, q2
 ; CHECK-NEXT:    vmov q6, q2
 ; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    mov r12, r7
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:  .LBB6_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    add.w r10, r3, r5
-; CHECK-NEXT:    vctp.32 r1
+; CHECK-NEXT:    vctp.32 r12
 ; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q7, [r9], #16
+; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
 ; CHECK-NEXT:    add.w r11, r10, r5
@@ -1159,7 +1146,7 @@
 ; CHECK-NEXT:    vmov q4, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    subs r1, #4
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r7]
@@ -1215,7 +1202,7 @@
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s8, [r1]
 ; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    add r12, r1
+; CHECK-NEXT:    add r9, r1
 ; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB6_2
@@ -1378,7 +1365,7 @@
 ; CHECK-NEXT:    adds r0, r3, #3
 ; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsls r5, r3, #2
@@ -1392,33 +1379,35 @@
 ; CHECK-NEXT:    adds r1, r0, #7
 ; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #6
-; CHECK-NEXT:    ldrd r3, r10, [sp, #16] @ 8-byte Folded Reload
 ; CHECK-NEXT:    str r1, [sp, #40] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #5
+; CHECK-NEXT:    ldr r7, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #4
-; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:    ldr.w r12, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr.w r9, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    adds r4, r0, #3
+; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
 ; CHECK-NEXT:    add.w r8, r0, #2
 ; CHECK-NEXT:    adds r1, r0, #1
-; CHECK-NEXT:    mov r3, r9
+; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vmov q6, q3
 ; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q2, q3
+; CHECK-NEXT:    mov r10, r7
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:  .LBB7_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    add.w r11, r3, r5
 ; CHECK-NEXT:    vctp.32 r10
 ; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q0, [r12], #16
+; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    vfmat.f32 q6, q1, q0
 ; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
@@ -1515,7 +1504,7 @@
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s6, [r1]
 ; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    add r9, r1
+; CHECK-NEXT:    add r12, r1
 ; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB7_2
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -683,8 +683,8 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    .pad #28
+; CHECK-NEXT:    sub sp, #28
 ; CHECK-NEXT:    add.w r12, sp, #12
 ; CHECK-NEXT:    cmp r3, #4
 ; CHECK-NEXT:    stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill
@@ -694,10 +694,10 @@
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
-; CHECK-NEXT:    ldr r2, [sp, #88]
+; CHECK-NEXT:    ldr r2, [sp, #92]
 ; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r10, [sp, #72]
+; CHECK-NEXT:    ldr r4, [sp, #76]
 ; CHECK-NEXT:    add.w r0, r1, r2, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r2
@@ -706,62 +706,66 @@
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r2, #7
-; CHECK-NEXT:    lsrs r2, r0, #3
+; CHECK-NEXT:    lsrs r1, r0, #3
 ; CHECK-NEXT:    b .LBB5_5
 ; CHECK-NEXT:  .LBB5_3: @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:  .LBB5_4: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    add.w r1, r12, r8
-; CHECK-NEXT:    add r1, r6
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    ldr r1, [sp, #96]
+; CHECK-NEXT:    add.w r0, r12, r8
+; CHECK-NEXT:    ldr r1, [sp, #100]
+; CHECK-NEXT:    add r0, r6
+; CHECK-NEXT:    add r0, r10
 ; CHECK-NEXT:    strb.w r0, [r1, r9]
 ; CHECK-NEXT:    add.w r9, r9, #1
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r9, r0
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  .LBB5_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB5_7 Depth 2
-; CHECK-NEXT:    ldr r0, [sp, #92]
-; CHECK-NEXT:    cmp r2, r2
-; CHECK-NEXT:    ldr.w r0, [r0, r9, lsl #2]
+; CHECK-NEXT:    ldr r0, [sp, #96]
+; CHECK-NEXT:    cmp r1, r1
+; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
 ; CHECK-NEXT:    bge .LBB5_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    ldr.w r11, [sp, #88]
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    dlstp.16 lr, r11
-; CHECK-NEXT:    ldm.w sp, {r4, r5, r7} @ 12-byte Folded Reload
-; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mla r3, r9, r11, r1
+; CHECK-NEXT:    ldr r2, [sp, #92]
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mla r3, r9, r2, r0
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r11, r2
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB5_7: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB5_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r10
+; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
+; CHECK-NEXT:    vadd.i16 q1, q0, r4
 ; CHECK-NEXT:    vldrb.s16 q0, [r3], #8
-; CHECK-NEXT:    vmlava.s16 r0, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r7], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r10
-; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r10
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r10
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB5_7
 ; CHECK-NEXT:    b .LBB5_4
 ; CHECK-NEXT:  .LBB5_8: @ %if.end
-; CHECK-NEXT:    ldr r0, [sp, #96]
-; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    ldr r0, [sp, #100]
+; CHECK-NEXT:    add sp, #28
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %cmp = icmp eq i16 %num_cols, 4
@@ -870,8 +874,8 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    .pad #28
+; CHECK-NEXT:    sub sp, #28
 ; CHECK-NEXT:    add.w r12, sp, #12
 ; CHECK-NEXT:    cmp r3, #4
 ; CHECK-NEXT:    stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill
@@ -881,10 +885,10 @@
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
-; CHECK-NEXT:    ldr r2, [sp, #88]
+; CHECK-NEXT:    ldr r2, [sp, #92]
 ; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r10, [sp, #72]
+; CHECK-NEXT:    ldr r4, [sp, #76]
 ; CHECK-NEXT:    add.w r0, r1, r2, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r2
@@ -893,61 +897,65 @@
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r2, #7
-; CHECK-NEXT:    lsrs r2, r0, #3
+; CHECK-NEXT:    lsrs r1, r0, #3
 ; CHECK-NEXT:  .LBB6_3: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB6_5 Depth 2
-; CHECK-NEXT:    ldr r0, [sp, #92]
-; CHECK-NEXT:    cmp r2, r2
-; CHECK-NEXT:    ldr.w r0, [r0, r9, lsl #2]
+; CHECK-NEXT:    ldr r0, [sp, #96]
+; CHECK-NEXT:    cmp r1, r1
+; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
 ; CHECK-NEXT:    bge .LBB6_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    ldr.w r11, [sp, #88]
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    dlstp.16 lr, r11
-; CHECK-NEXT:    ldm.w sp, {r4, r5, r7} @ 12-byte Folded Reload
-; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mla r3, r9, r11, r1
+; CHECK-NEXT:    ldr r2, [sp, #92]
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mla r3, r9, r2, r0
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r11, r2
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB6_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB6_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r10
+; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
+; CHECK-NEXT:    vadd.i16 q1, q0, r4
 ; CHECK-NEXT:    vldrb.s16 q0, [r3], #8
-; CHECK-NEXT:    vmlava.s16 r0, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r7], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r10
-; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r10
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r10
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB6_5
 ; CHECK-NEXT:    b .LBB6_7
 ; CHECK-NEXT:  .LBB6_6: @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    add.w r1, r12, r8
-; CHECK-NEXT:    add r1, r6
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    ldr r1, [sp, #96]
+; CHECK-NEXT:    add.w r0, r12, r8
+; CHECK-NEXT:    ldr r1, [sp, #100]
+; CHECK-NEXT:    add r0, r6
+; CHECK-NEXT:    add r0, r10
 ; CHECK-NEXT:    strb.w r0, [r1, r9]
 ; CHECK-NEXT:    add.w r9, r9, #1
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r9, r0
 ; CHECK-NEXT:    bne .LBB6_3
 ; CHECK-NEXT:  .LBB6_8: @ %if.end
-; CHECK-NEXT:    ldr r0, [sp, #96]
-; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    ldr r0, [sp, #100]
+; CHECK-NEXT:    add sp, #28
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %cmp = icmp eq i16 %num_cols, 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -14,8 +14,8 @@
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
 ; CHECK-NEXT:    vmov.i32 q3, #0x4
-; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:  .LBB0_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -2330,8 +2330,8 @@
 ; CHECK-NEXT:    cbz r1, .LBB29_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:  .LBB29_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -2383,8 +2383,8 @@
 ; CHECK-NEXT:    cbz r2, .LBB30_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB30_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -2442,8 +2442,8 @@
 ; CHECK-NEXT:    cbz r2, .LBB31_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB31_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16