Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2196,7 +2196,7 @@
     else {
       SmallPtrSet<MachineInstr*, 4> MemOps;
       SmallSet<unsigned, 4> MemRegs;
-      for (int i = NumMove-1; i >= 0; --i) {
+      for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
         MemOps.insert(Ops[i]);
         MemRegs.insert(Ops[i]->getOperand(0).getReg());
       }
Index: test/CodeGen/ARM/prera-ldst-insertpt.mir
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/prera-ldst-insertpt.mir
@@ -0,0 +1,93 @@
+# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s
+--- |
+  target triple = "thumbv7---eabi"
+
+  define void @a(i32* nocapture %x, i32 %y, i32 %z) {
+  entry:
+    ret void
+  }
+
+  define void @b(i32* nocapture %x, i32 %y, i32 %z) {
+  entry:
+    ret void
+  }
+...
+---
+name:            a
+alignment:       1
+tracksRegLiveness: true
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+  - { reg: '%r2', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1, %r2
+
+    %2 : rgpr = COPY %r2
+    %1 : rgpr = COPY %r1
+    %0 : gpr = COPY %r0
+    %3 : rgpr = t2MUL %2, %2, 14, _
+    %4 : rgpr = t2MUL %1, %1, 14, _
+    %5 : rgpr = t2MOVi32imm -858993459
+    %6 : rgpr, %7 : rgpr  = t2UMULL killed %3, %5, 14, _
+    %8 : rgpr, %9 : rgpr  = t2UMULL killed %4, %5, 14, _
+    t2STRi12 %1, %0, 0, 14, _ :: (store 4)
+    %10 : rgpr = t2LSLri %2, 1, 14, _, _
+    t2STRi12 killed %10, %0, 4, 14, _ :: (store 4)
+    %11 : rgpr = t2MOVi 55, 14, _, _
+    %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _
+    t2STRi12 killed %12, %0, 16, 14, _ :: (store 4)
+    %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _
+    t2STRi12 killed %13, %0, 20, 14, _ :: (store 4)
+    tBX_RET 14, _
+---
+name:            b
+alignment:       1
+tracksRegLiveness: true
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+  - { reg: '%r2', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1, %r2
+
+    %2 : rgpr = COPY %r2
+    %1 : rgpr = COPY %r1
+    %0 : gpr = COPY %r0
+    t2STRi12 %1, %0, 0, 14, _ :: (store 4)
+    %10 : rgpr = t2LSLri %2, 1, 14, _, _
+    t2STRi12 killed %10, %0, 4, 14, _ :: (store 4)
+    %3 : rgpr = t2MUL %2, %2, 14, _
+    t2STRi12 %3, %0, 8, 14, _ :: (store 4)
+    %4 : rgpr = t2MUL %1, %1, 14, _
+    %5 : rgpr = t2MOVi32imm -858993459
+    %6 : rgpr, %7 : rgpr  = t2UMULL killed %3, %5, 14, _
+    %8 : rgpr, %9 : rgpr  = t2UMULL killed %4, %5, 14, _
+    %10 : rgpr = t2LSLri %2, 1, 14, _, _
+    %11 : rgpr = t2MOVi 55, 14, _, _
+    %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _
+    t2STRi12 killed %12, %0, 16, 14, _ :: (store 4)
+    %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _
+    t2STRi12 killed %13, %0, 20, 14, _ :: (store 4)
+    tBX_RET 14, _
+
+...
+# Make sure we move the paired stores next to each other.
+# FIXME: Make sure we don't extend the live range of a stored value
+# when we don't need to.
+# CHECK-LABEL: name: a
+# CHECK: t2STRi12 %1,
+# CHECK-NEXT: t2STRi12 killed %10,
+# CHECK-NEXT: %13 = t2ADDrs %11
+# CHECK-NEXT: t2STRi12 killed %12,
+# CHECK-NEXT: t2STRi12 killed %13,
+#
+# CHECK-LABEL: name: b
+# CHECK: t2STRi12 {{.*}}, 0
+# CHECK-NEXT: t2STRi12 {{.*}}, 4
+# CHECK-NEXT: t2STRi12 {{.*}}, 8
+# CHECK-NEXT: t2ADDrs
+# CHECK-NEXT: t2STRi12 {{.*}}, 16
+# CHECK-NEXT: t2STRi12 {{.*}}, 20
Index: test/CodeGen/ARM/vldm-liveness.ll
===================================================================
--- test/CodeGen/ARM/vldm-liveness.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llc -mtriple thumbv7-apple-ios -verify-machineinstrs -o - %s | FileCheck %s
-
-; ARM load store optimizer was dealing with a sequence like:
-;     s1 = VLDRS [r0, 1], Q0<imp-def>
-;     s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
-;     s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
-;     s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
-;
-; It decided to combine the {s0, s1} loads into a single instruction in the
-; third position. However, this leaves the instruction defining s3 with a stray
-; imp-use of Q0, which is undefined.
-;
-; The verifier catches this, so this test just makes sure that appropriate
-; liveness flags are added.
-;
-; I believe the change will be tested as long as the vldmia is not the first of
-; the loads. Earlier optimisations may perturb the output over time, but
-; fiddling the indices should be sufficient to restore the test.
-
-define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
-; CHECK-LABEL: foo:
-; CHECK: vldr s3, [r0, #8]
-; CHECK: vldmia r0, {s0, s1}
-; CHECK: vldr s2, [r0, #16]
-   %off0 = getelementptr float, float* %ptr, i32 0
-   %val0 = load float, float* %off0
-   %off1 = getelementptr float, float* %ptr, i32 1
-   %val1 = load float, float* %off1
-   %off4 = getelementptr float, float* %ptr, i32 4
-   %val4 = load float, float* %off4
-   %off2 = getelementptr float, float* %ptr, i32 2
-   %val2 = load float, float* %off2
-
-   %vec1 = insertelement <4 x float> undef, float %val0, i32 0
-   %vec2 = insertelement <4 x float> %vec1, float %val1, i32 1
-   %vec3 = insertelement <4 x float> %vec2, float %val4, i32 2
-   %vec4 = insertelement <4 x float> %vec3, float %val2, i32 3
-
-   ret <4 x float> %vec4
-}
Index: test/CodeGen/ARM/vldm-liveness.mir
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/vldm-liveness.mir
@@ -0,0 +1,37 @@
+# RUN: llc -run-pass arm-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+# The ARM load/store optimizer was dealing with a sequence like:
+#     s1 = VLDRS [r0, 1], Q0<imp-def>
+#     s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
+#     s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
+#     s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+#
+# It decided to combine the {s0, s1} loads into a single instruction in the
+# third position. However, this leaves the instruction defining s3 with a stray
+# imp-use of Q0, which is undefined.
+#
+# The verifier catches this, so this test just makes sure that appropriate
+# liveness flags are added.
+--- |
+  target triple = "thumbv7-apple-ios"
+  define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
+    ret <4 x float> undef
+  }
+...
+---
+name:            foo
+alignment:       1
+liveins:
+  - { reg: '%r0' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %r0
+
+    %s1 = VLDRS %r0, 1, 14, _, implicit-def %q0 :: (load 4)
+    %s3 = VLDRS %r0, 2, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+    %s0 = VLDRS %r0, 0, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+    %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+    tBX_RET 14, _, implicit %q0
+...
+# CHECK: %s3 = VLDRS %r0, 2, 14, _, implicit killed undef %q0, implicit-def %q0 :: (load 4)
+# CHECK: VLDMSIA %r0, 14, _, def %s0, def %s1, implicit-def _
+# CHECK: %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)