diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -115,6 +115,7 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <numeric>
 #include <map>
 #include <utility>
 
@@ -163,6 +164,10 @@
   cl::init(std::numeric_limits<uint16_t>::max()),
   cl::desc("LSR search space complexity limit"));
 
+static cl::opt<bool> EnableRecursiveSetupCost(
+  "lsr-recursive-setupcost", cl::Hidden, cl::init(true),
+  cl::desc("Enable more thorough lsr setup cost calculation"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -1211,6 +1216,25 @@
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr);
 
+static unsigned getSetupCost(const SCEV *Reg) {
+  if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+    return 1;
+  if (!EnableRecursiveSetupCost)
+    return 0;
+  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+    return getSetupCost(S->getStart());
+  if (auto S = dyn_cast<SCEVCastExpr>(Reg))
+    return getSetupCost(S->getOperand());
+  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
+    return std::accumulate(S->op_begin(), S->op_end(), 0,
+                           [](unsigned i, const SCEV *Reg) {
+                             return i + getSetupCost(Reg);
+                           });
+  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
+    return getSetupCost(S->getLHS()) + getSetupCost(S->getRHS());
+  return 0;
+}
+
 /// Tally up interesting quantities from the given register.
 void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                         SmallPtrSetImpl<const SCEV *> &Regs,
@@ -1276,12 +1300,7 @@
 
   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
-  if (!isa<SCEVUnknown>(Reg) &&
-      !isa<SCEVConstant>(Reg) &&
-      !(isa<SCEVAddRecExpr>(Reg) &&
-        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
-         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-    ++C.SetupCost;
+  C.SetupCost += getSetupCost(Reg);
 
   C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
                SE.hasComputableLoopEvolution(Reg, L);
diff --git a/llvm/test/CodeGen/ARM/lsr-setupcost.ll b/llvm/test/CodeGen/ARM/lsr-setupcost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/lsr-setupcost.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv6m-none-eabi -loop-reduce %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+%struct.arm_matrix_instance_q15 = type { i16, i16, i16* }
+
+define i32 @arm_mat_add_q15(%struct.arm_matrix_instance_q15* nocapture readonly %pSrcA, %struct.arm_matrix_instance_q15* nocapture readonly %pSrcB, %struct.arm_matrix_instance_q15* nocapture readonly %pDst)  {
+; CHECK-LABEL: @arm_mat_add_q15(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NUMROWS:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15:%.*]], %struct.arm_matrix_instance_q15* [[PSRCA:%.*]], i32 0, i32 0
+; CHECK-NEXT:    [[I0:%.*]] = load i16, i16* [[NUMROWS]], align 4
+; CHECK-NEXT:    [[NUMCOLS:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCA]], i32 0, i32 1
+; CHECK-NEXT:    [[I1:%.*]] = load i16, i16* [[NUMCOLS]], align 2
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[I1]], [[I0]]
+; CHECK-NEXT:    [[CMP22:%.*]] = icmp eq i16 [[MUL]], 0
+; CHECK-NEXT:    br i1 [[CMP22]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[CONV5:%.*]] = zext i16 [[MUL]] to i32
+; CHECK-NEXT:    [[PDATA2:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PDST:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[I2:%.*]] = load i16*, i16** [[PDATA2]], align 4
+; CHECK-NEXT:    [[PDATA1:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCB:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[I3:%.*]] = load i16*, i16** [[PDATA1]], align 4
+; CHECK-NEXT:    [[PDATA:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCA]], i32 0, i32 2
+; CHECK-NEXT:    [[I4:%.*]] = load i16*, i16** [[PDATA]], align 4
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[PINA_026:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[I4]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BLKCNT_025:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[CONV5]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PINB_024:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY]] ], [ [[I3]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[POUT_023:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[I2]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PINA_026]], i32 1
+; CHECK-NEXT:    [[I5:%.*]] = load i16, i16* [[PINA_026]], align 2
+; CHECK-NEXT:    [[CONV7:%.*]] = sext i16 [[I5]] to i32
+; CHECK-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PINB_024]], i32 1
+; CHECK-NEXT:    [[I6:%.*]] = load i16, i16* [[PINB_024]], align 2
+; CHECK-NEXT:    [[CONV9:%.*]] = sext i16 [[I6]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV9]], [[CONV7]]
+; CHECK-NEXT:    [[I7:%.*]] = icmp sgt i32 [[ADD]], -32768
+; CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[I7]], i32 [[ADD]], i32 -32768
+; CHECK-NEXT:    [[I8:%.*]] = icmp slt i32 [[SPEC_SELECT_I]], 32767
+; CHECK-NEXT:    [[CALL21:%.*]] = select i1 [[I8]], i32 [[SPEC_SELECT_I]], i32 32767
+; CHECK-NEXT:    [[CONV10:%.*]] = trunc i32 [[CALL21]] to i16
+; CHECK-NEXT:    [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_023]], i32 1
+; CHECK-NEXT:    store i16 [[CONV10]], i16* [[POUT_023]], align 2
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[BLKCNT_025]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %numRows = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 0
+  %i0 = load i16, i16* %numRows, align 4
+  %numCols = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 1
+  %i1 = load i16, i16* %numCols, align 2
+  %mul = mul i16 %i1, %i0
+  %cmp22 = icmp eq i16 %mul, 0
+  br i1 %cmp22, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  %conv5 = zext i16 %mul to i32
+  %pData2 = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pDst, i32 0, i32 2
+  %i2 = load i16*, i16** %pData2, align 4
+  %pData1 = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcB, i32 0, i32 2
+  %i3 = load i16*, i16** %pData1, align 4
+  %pData = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 2
+  %i4 = load i16*, i16** %pData, align 4
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %pInA.026 = phi i16* [ %incdec.ptr, %while.body ], [ %i4, %while.body.preheader ]
+  %blkCnt.025 = phi i32 [ %dec, %while.body ], [ %conv5, %while.body.preheader ]
+  %pInB.024 = phi i16* [ %incdec.ptr8, %while.body ], [ %i3, %while.body.preheader ]
+  %pOut.023 = phi i16* [ %incdec.ptr11, %while.body ], [ %i2, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %pInA.026, i32 1
+  %i5 = load i16, i16* %pInA.026, align 2
+  %conv7 = sext i16 %i5 to i32
+  %incdec.ptr8 = getelementptr inbounds i16, i16* %pInB.024, i32 1
+  %i6 = load i16, i16* %pInB.024, align 2
+  %conv9 = sext i16 %i6 to i32
+  %add = add nsw i32 %conv9, %conv7
+  %i7 = icmp sgt i32 %add, -32768
+  %spec.select.i = select i1 %i7, i32 %add, i32 -32768
+  %i8 = icmp slt i32 %spec.select.i, 32767
+  %call21 = select i1 %i8, i32 %spec.select.i, i32 32767
+  %conv10 = trunc i32 %call21 to i16
+  %incdec.ptr11 = getelementptr inbounds i16, i16* %pOut.023, i32 1
+  store i16 %conv10, i16* %pOut.023, align 2
+  %dec = add nsw i32 %blkCnt.025, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i32 0
+}
+
+
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
--- a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-recursive-setupcost=0 < %s | FileCheck %s
 
 ; Test that we generate the correct code when a loop carried value
 ; is scheduled one stage earlier than it's use. The code in
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll
@@ -7,10 +7,10 @@
 
 ; In this test case, the second loop is pipelined, block b5.
 
-; CHECK: loop0
+; CHECK: loop1
 ; CHECK: [[REG0:r([0-9]+)]] += mpyi
 ; CHECK: [[REG2:r([0-9]+)]] = add([[REG1:r([0-9]+)]],add([[REG0]],#8
-; CHECK: endloop0
+; CHECK: endloop1
 
 %s.0 = type { %s.1*, %s.4*, %s.7*, i8*, i8, i32, %s.8*, i32, i32, i32, i8, i8, i32, i32, double, i8, i8, i8, i8, i8, i8, i8, i8, i32, i8, i8, i8, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %s.9*], [4 x %s.10*], [4 x %s.10*], i32, %s.23*, i8, i8, [16 x i8], [16 x i8], [16 x i8], i32, i8, i8, i8, i8, i16, i16, i8, i8, i8, %s.11*, i32, i32, i32, i32, i8*, i32, [4 x %s.23*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %s.12*, %s.13*, %s.14*, %s.15*, %s.16*, %s.17*, %s.18*, %s.19*, %s.20*, %s.21*, %s.22* }
 %s.1 = type { void (%s.2*)*, void (%s.2*, i32)*, void (%s.2*)*, void (%s.2*, i8*)*, void (%s.2*)*, i32, %s.3, i32, i32, i8**, i32, i8**, i32, i32 }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | FileCheck %s
+; RUN: opt < %s -loop-reduce -lsr-recursive-setupcost=0 -S | FileCheck %s
 
 ; This test is adapted from the n-body test of the LLVM test-suite: A bug in
 ; r345114 caused LSR to generate incorrect code. The test verifies that the