Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -487,6 +487,8 @@
   /// addressing mode expressions.
   bool shouldFavorPostInc() const;
 
+  bool shouldFavorCrossIterationPostInc() const;
+
   /// Return true if the target supports masked load/store
   /// AVX2 and AVX-512 targets allow masks for consecutive load and store
   bool isLegalMaskedStore(Type *DataType) const;
@@ -1054,6 +1056,7 @@
                                      TargetTransformInfo::LSRCost &C2) = 0;
   virtual bool canMacroFuseCmp() = 0;
   virtual bool shouldFavorPostInc() const = 0;
+  virtual bool shouldFavorCrossIterationPostInc() const = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@@ -1287,6 +1290,9 @@
   bool shouldFavorPostInc() const override {
     return Impl.shouldFavorPostInc();
   }
+  bool shouldFavorCrossIterationPostInc() const override {
+    return Impl.shouldFavorCrossIterationPostInc();
+  }
   bool isLegalMaskedStore(Type *DataType) override {
     return Impl.isLegalMaskedStore(DataType);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -254,6 +254,8 @@
 
   bool shouldFavorPostInc() const { return false; }
 
+  bool shouldFavorCrossIterationPostInc() const { return false; }
+
   bool isLegalMaskedStore(Type *DataType) { return false; }
 
   bool isLegalMaskedLoad(Type *DataType) { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -163,6 +163,10 @@
   return TTIImpl->shouldFavorPostInc();
 }
 
+bool TargetTransformInfo::shouldFavorCrossIterationPostInc() const {
+  return TTIImpl->shouldFavorCrossIterationPostInc();
+}
+
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedStore(DataType);
 }
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -47,6 +47,7 @@
 
   const ARMSubtarget *ST;
   const ARMTargetLowering *TLI;
+  const Function &F;
 
   // Currently the following features are excluded from InlineFeatureWhitelist.
   // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureVFPOnlySP, FeatureD16
@@ -87,13 +88,18 @@
 public:
   explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
-        TLI(ST->getTargetLowering()) {}
+        TLI(ST->getTargetLowering()), F(F) {}
 
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool shouldFavorCrossIterationPostInc() const {
+    return !F.optForMinSize() &&
+           ST->isMClass() && ST->isThumb2() && !ST->hasBranchPredictor();
+  }
+
   /// Floating-point computation using ARMv8 AArch32 Advanced
   /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
   /// is IEEE-754 compliant, but it's not covered in this target.
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1236,16 +1236,44 @@
     }
 
     unsigned LoopCost = 1;
-    if (TTI.shouldFavorPostInc()) {
-      const SCEV *LoopStep = AR->getStepRecurrence(SE);
-      if (isa<SCEVConstant>(LoopStep)) {
-        // Check if a post-indexed load/store can be used.
-        if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
-            TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+    if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+        TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+
+      std::function<const SCEVConstant *(const SCEV *)> GetConstantStart =
+          [&GetConstantStart](const SCEV *S) -> const SCEVConstant * {
+        if (auto *C = dyn_cast<SCEVConstant>(S))
+          return C;
+
+        if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(S))
+          return GetConstantStart(AddRec->getStart());
+
+        if (auto *Add = dyn_cast<SCEVAddExpr>(S))
+          return GetConstantStart(Add->getOperand(0));
+
+        return nullptr;
+      };
+
+      if (TTI.shouldFavorCrossIterationPostInc()) {
+        if (isa<SCEVConstant>(AR->getOperand(1))) {
+          if (auto *Start = GetConstantStart(AR)) {
+            const APInt &StartInt = Start->getAPInt();
+            const APInt &ARInt = cast<SCEVConstant>(AR->getOperand(1))->getAPInt();
+            // We can turn this access into a post increment as the initial offset
+            // required matches the recurrence.
+            if ((StartInt.isNegative() && StartInt.abs() == ARInt) ||
+                (ARInt.isNegative() && ARInt.abs() == StartInt))
+              LoopCost = 0;
+          }
+        }
+      }
+
+      if (TTI.shouldFavorPostInc()) {
+        const SCEV *LoopStep = AR->getStepRecurrence(SE);
+        if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
           if (!isa<SCEVConstant>(LoopStart) &&
-                SE.isLoopInvariant(LoopStart, L))
-              LoopCost = 0;
+              SE.isLoopInvariant(LoopStart, L))
+            LoopCost = 0;
         }
       }
     }
@@ -1262,7 +1290,6 @@
     }
   }
   ++C.NumRegs;
-
   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
   if (!isa<SCEVUnknown>(Reg) &&
@@ -1354,8 +1381,9 @@
     // specifically not supported.
     if (LU.Kind == LSRUse::Address && Offset != 0 &&
         !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
-                              Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
+                              Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) {
       C.NumBaseAdds++;
+    }
   }
 
   // If we don't count instruction cost exit here.
@@ -1394,6 +1422,7 @@
 
   // BaseAdds adds instructions for unfolded registers.
   if (LU.Kind != LSRUse::ICmpZero)
     C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
+
   assert(isValid() && "invalid cost");
 }
@@ -3738,8 +3767,8 @@
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-  for (int64_t Offset : Worklist) {
+
+  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
     Formula F = Base;
     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
@@ -3761,8 +3790,34 @@
 
       (void)InsertFormula(LU, LUIdx, F);
     }
+  };
+
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+  // With constant offsets and constant steps, we can generate post index
+  // accesses by having the offset equal the step. So, for access #0 with a
+  // step of 8, we could generate a G - 8 base which would require the first
+  // access to be ((G - 8) + 8),+,8. The post-indexed access would then update
+  // the pointer for itself in the next iteration.
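+  // For example (illustrative, using the unrolled-by-two i32 loops from the
+  // new test below): with Worklist offsets {0, 4} and a step of 8, the code
+  // below also tries offsets {-8, -4}. The -8 form rebases access #0 onto
+  // ((G - 8) + 8),+,8, which a post-indexed load/store can implement while
+  // also producing the pointer that the next iteration starts from, so the
+  // separate pointer add in the loop body disappears.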
+  if (TTI.shouldFavorCrossIterationPostInc() && LU.Kind == LSRUse::Address) {
+    if (auto *GAddRec = dyn_cast<SCEVAddRecExpr>(G)) {
+      if (auto *StepRec =
+          dyn_cast<SCEVConstant>(GAddRec->getStepRecurrence(SE))) {
+        const APInt &StepInt = StepRec->getAPInt();
+        int64_t Step = StepInt.isNegative() ?
+          StepInt.getSExtValue() : StepInt.getZExtValue();
+
+        for (int64_t Offset : Worklist) {
+          Offset -= Step;
+          GenerateOffset(G, Offset);
+        }
+      }
+    }
   }
 
+  for (int64_t Offset : Worklist)
+    GenerateOffset(G, Offset);
+
   int64_t Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm == 0)
     return;
Index: test/CodeGen/ARM/dsp-post-incs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/dsp-post-incs.ll
@@ -0,0 +1,531 @@
+; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m7 %s -o - | FileCheck %s --check-prefix=DISABLED
+; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=MINSIZE
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX
+
+; Tests to check that post increment addressing modes are used instead of
+; updating base pointers with add instructions.
+
+; DISABLED-LABEL: test_qadd_2
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_2
+; CHECK: sub{{.*}} [[A:r[0-9]+]], r0, #8
+; CHECK: subs [[B:r[0-9]+]], #8
+; CHECK: subs [[OUT:r[0-9]+]], #8
+
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #8]!
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #4]
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #4]
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #4]
+; CHECK: blo
+define void @test_qadd_2(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = or i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 2
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd_2_backwards
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_2_backwards
+
+; CHECK-DEFAULT: [[shift:[rl0-9]+]], r3, #2
+; CHECK-DEFAULT: add{{.*}} [[A:r[0-9]+]], r0, [[shift]], lsl #2
+; CHECK-DEFAULT: add{{.*}} [[B:r[0-9]+]], r1, [[shift]], lsl #2
+; CHECK-DEFAULT: add{{.*}} [[OUT:r[0-9]+]], r2, [[shift]], lsl #2
+
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[B]], #-8]!
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[A]], #-8]!
+; CHECK-DEFAULT: str{{.*}}, {{\[}}[[OUT]], #-8]!
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[B]], #-4]
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[A]], #-4]
+; CHECK-DEFAULT: str{{.*}}, {{\[}}[[OUT]], #-4]
+
+; FIXME: The higher complexity produces more instructions in the preheader
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: str{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: str{{.*}} lsl #2]
+
+; CHECK: blo
+
+define void @test_qadd_2_backwards(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ %N, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = sub nsw nuw i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = sub nsw nuw i32 %idx.1, 2
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd_3
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_3
+; CHECK: sub{{.*}}, #12
+; CHECK: sub{{.*}}, #12
+; CHECK: subs{{.*}}, #12
+
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #12]!
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #12]!
+; CHECK: str{{.*}}, {{\[}}{{.*}}, #12]!
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK: str{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK: str{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK: blo
+define void @test_qadd_3(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nuw nsw i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %idx.3 = add nuw nsw i32 %idx.1, 2
+  %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
+  %a.3 = load i32, i32* %gep.a.3
+  %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
+  %b.3 = load i32, i32* %gep.b.3
+  %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
+  %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
+  store i32 %qadd.3, i32* %addr.3
+  %i.next = add nsw nuw i32 %i, -3
+  %idx.next = add nsw nuw i32 %idx.1, 3
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd_4
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_4
+; CHECK-COMPLEX: sub{{.*}}, #16
+; CHECK-COMPLEX: sub{{.*}}, #16
+; CHECK-COMPLEX: sub{{.*}}, #16
+
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #16]!
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #16]!
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #16]!
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}} #4]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #12]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #12]
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #12]
+; CHECK: blo
+define void @test_qadd_4(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = or i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %idx.3 = or i32 %idx.1, 2
+  %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
+  %a.3 = load i32, i32* %gep.a.3
+  %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
+  %b.3 = load i32, i32* %gep.b.3
+  %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
+  %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
+  store i32 %qadd.3, i32* %addr.3
+  %idx.4 = or i32 %idx.1, 3
+  %gep.a.4 = getelementptr inbounds i32, i32* %a.array, i32 %idx.4
+  %a.4 = load i32, i32* %gep.a.4
+  %gep.b.4 = getelementptr inbounds i32, i32* %b.array, i32 %idx.4
+  %b.4 = load i32, i32* %gep.b.4
+  %qadd.4 = call i32 @llvm.arm.qadd(i32 %a.4, i32 %b.4)
+  %addr.4 = getelementptr inbounds i32, i32* %out.array, i32 %idx.4
+  store i32 %qadd.4, i32* %addr.4
+  %i.next = add nsw nuw i32 %i, -4
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd16_2
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd16_2
+; CHECK: sub.w [[A:r[0-9]+]], r0, #8
+; CHECK: subs [[B:r[0-9]+]], #8
+; CHECK: subs [[OUT:r[0-9]+]], #16
+
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #8]!
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #16]!
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #4]
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #4]
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #8]
+define void @test_qadd16_2(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
+  %cast.a.1 = bitcast i16* %gep.a.1 to i32*
+  %a.1 = load i32, i32* %cast.a.1
+  %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
+  %cast.b.1 = bitcast i16* %gep.b.1 to i32*
+  %b.1 = load i32, i32* %cast.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nsw nuw i32 %idx.1, 2
+  %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
+  %cast.a.2 = bitcast i16* %gep.a.2 to i32*
+  %a.2 = load i32, i32* %cast.a.2
+  %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
+  %cast.b.2 = bitcast i16* %gep.b.2 to i32*
+  %b.2 = load i32, i32* %cast.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; MINSIZE-LABEL: test_qadd16_2_minsize
+; MINSIZE-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; MINSIZE-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+define void @test_qadd16_2_minsize(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) minsize {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
+  %cast.a.1 = bitcast i16* %gep.a.1 to i32*
+  %a.1 = load i32, i32* %cast.a.1
+  %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
+  %cast.b.1 = bitcast i16* %gep.b.1 to i32*
+  %b.1 = load i32, i32* %cast.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nsw nuw i32 %idx.1, 2
+  %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
+  %cast.a.2 = bitcast i16* %gep.a.2 to i32*
+  %a.2 = load i32, i32* %cast.a.2
+  %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
+  %cast.b.2 = bitcast i16* %gep.b.2 to i32*
+  %b.2 = load i32, i32* %cast.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; MINSIZE-LABEL: test_qadd16_2_size
+; MINSIZE: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; MINSIZE: str{{.*}}, [{{.*}}, {{.*}}]!
+define void @test_qadd16_2_size(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) optsize {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
+  %cast.a.1 = bitcast i16* %gep.a.1 to i32*
+  %a.1 = load i32, i32* %cast.a.1
+  %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
+  %cast.b.1 = bitcast i16* %gep.b.1 to i32*
+  %b.1 = load i32, i32* %cast.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nsw nuw i32 %idx.1, 2
+  %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
+  %cast.a.2 = bitcast i16* %gep.a.2 to i32*
+  %a.2 = load i32, i32* %cast.a.2
+  %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
+  %cast.b.2 = bitcast i16* %gep.b.2 to i32*
+  %b.2 = load i32, i32* %cast.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_fma
+; DISABLED-NOT: vldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: vstr{{.*}}, [{{.*}}, {{.*}}]!
+
+; TODO: I think we should be able to use post inc addressing with VLDM
+; instructions.
+; CHECK-LABEL: test_fma
+; CHECK: subs [[A:r[0-9]+]], #8
+; CHECK: subs [[B:r[0-9]+]], #8
+
+; CHECK: vldr s{{.*}}, {{\[}}[[B]], #8]
+; CHECK: vldr s{{.*}}, {{\[}}[[A]], #8]
+; CHECK: vldr s{{.*}}, {{\[}}[[B]], #12]
+; CHECK: vldr s{{.*}}, {{\[}}[[A]], #12]
+define float @test_fma(float* %a, float* %b, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
+  %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
+  %a.1 = load float, float* %gep.a.1
+  %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
+  %b.1 = load float, float* %gep.b.1
+  %fmul.1 = fmul float %a.1, %b.1
+  %fma.1 = fadd float %fmul.1, %res
+  %idx.2 = or i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
+  %a.2 = load float, float* %gep.a.2
+  %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
+  %b.2 = load float, float* %gep.b.2
+  %fmul.2 = fmul float %a.2, %b.2
+  %fma.2 = fadd float %fmul.2, %fma.1
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 2
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret float %fma.2
+}
+
+; DISABLED-LABEL: convolve_16bit
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: convolve_16bit
+
+; CHECK: ldr.w {{.*}}, [{{.*}}, lsl #2]
+; CHECK: ldr.w [[pA:r[rl0-9]+]], [{{.*}}, lsl #2]
+; CHECK: ldr.w [[pB:[rl0-9]+]], [{{.*}}, lsl #2]
+; CHECK: add{{.*}} [[A:[rl0-9]+]], [[pA]], {{.*}}, lsl #1
+; CHECK: sub{{.*}} [[B:[rl0-9]+]], [[pB]], #8
+
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #4]
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #4]
+define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
+                            i32 %filter_dim, i32 %out_width, i32 %out_height,
+                            i32** nocapture readonly %convolved) {
+entry:
+  %cmp92 = icmp eq i32 %out_height, 0
+  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:  ; preds = %entry
+  %xtraiter = and i32 %filter_dim, 3
+  %unroll_iter = sub i32 %filter_dim, %xtraiter
+  br label %for.cond1.preheader
+
+for.cond1.preheader:  ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
+  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
+  %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
+  %tmp3 = load i32*, i32** %arrayidx22, align 4
+  br label %for.cond9.preheader.us.us.preheader
+
+for.cond9.preheader.us.us.preheader:  ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph
+  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
+  br label %for.cond9.preheader.us.us
+
+for.cond9.preheader.us.us:  ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader
+  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
+  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
+  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
+  %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
+  %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
+  %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
+  %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
+  br label %for.body12.us.us
+
+for.body12.us.us:  ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
+  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
+  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
+  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
+  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
+  %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
+  %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
+  %conv.us.us = sext i16 %tmp9 to i32
+  %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
+  %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
+  %conv17.us.us = sext i16 %tmp10 to i32
+  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
+  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
+  %inc.us.us = or i32 %filter_x.053.us.us, 1
+  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
+  %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
+  %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
+  %conv.us.us.1 = sext i16 %tmp11 to i32
+  %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
+  %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
+  %conv17.us.us.1 = sext i16 %tmp12 to i32
+  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
+  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
+  %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
+  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
+  %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
+  %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
+  %conv.us.us.2 = sext i16 %tmp13 to i32
+  %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
+  %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
+  %conv17.us.us.2 = sext i16 %tmp14 to i32
+  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
+  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
+  %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
+  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
+  %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
+  %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
+  %conv.us.us.3 = sext i16 %tmp15 to i32
+  %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
+  %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
+  %conv17.us.us.3 = sext i16 %tmp16 to i32
+  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
+  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
+  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
+  %niter.nsub.3 = add i32 %niter, -4
+  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
+
+for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa:  ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
+  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
+  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
+  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
+
+for.cond5.for.cond.cleanup7_crit_edge.us:  ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us
+  %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
+  store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
+  %add25.us = add nuw i32 %res_x.060.us, 1
+  %exitcond99 = icmp eq i32 %add25.us, %out_width
+  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
+
+for.cond.cleanup3:  ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader
+  %add28 = add nuw i32 %res_y.093, 1
+  %exitcond100 = icmp eq i32 %add28, %out_height
+  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup:  ; preds = %for.cond.cleanup3, %entry
+  ret void
+}
+
+declare i32 @llvm.arm.qadd(i32, i32)
+declare i32 @llvm.arm.qadd16(i32, i32)
+
Index: test/CodeGen/ARM/loop-align-cortex-m.ll
===================================================================
--- test/CodeGen/ARM/loop-align-cortex-m.ll
+++ test/CodeGen/ARM/loop-align-cortex-m.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
 ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
 
 define void @test_loop_alignment(i32* %in, i32* %out) optsize {
 ; CHECK-LABEL: test_loop_alignment:
-; CHECK: movs {{r[0-9]+}}, #0
+; CHECK: mov{{.*}}, #4092
 ; CHECK: .p2align 2
 
 entry:
Index: test/Transforms/LoopStrengthReduce/ARM/complexity.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/ARM/complexity.ll
+++ test/Transforms/LoopStrengthReduce/ARM/complexity.ll
@@ -1,21 +1,25 @@
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT -; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX +; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m4 %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s +; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m4 %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s -; CHECK-DEFAULT-LABEL: for.body12.us.us: -; CHECK-DEFAULT: phi i32 -; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ] -; CHECK-DEFAULT: phi i32 -; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8 +; CHECK-LABEL: for.cond9.preheader.us.us: +; CHECK: [[SCEVGEP:%[^ ]+]] = getelementptr i16, i16* %tmp5, i32 -4 +; CHECK: [[SCEVGEP9:%[^ ]+]] = getelementptr i16, i16* %tmp6, i32 %lsr.iv -; CHECK-COMPLEX-LABEL: for.body12.us.us: -; CHECK-COMPLEX: phi i32 -; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ] -; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ] -; CHECK-COMPLEX: phi i32 -; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 -; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4 +; CHECK-LABEL: for.body12.us.us: +; CHECK: [[LSR_IV10:%[^ ]+]] = phi i16* [ [[SCEVGEP11:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP9]], %for.cond9.preheader.us.us ] +; CHECK: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP]], %for.cond9.preheader.us.us ] +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 4 +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 5 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 5 +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 6 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 6 +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 7 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 7 +; CHECK: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK: [[SCEVGEP11]] = getelementptr i16, i16* [[LSR_IV10]], i32 4 define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) { entry: