Index: docs/Passes.rst
===================================================================
--- docs/Passes.rst
+++ docs/Passes.rst
@@ -780,6 +780,14 @@
 access for the first iteration, and then creating a new GEP instruction in the
 loop to increment the value by the appropriate amount.
 
+``-loop-exit-values``: Optimize Loop Exit Values
+------------------------------------------------
+
+This optimization finds loop exit values reevaluated after the loop
+execution and replaces them by the corresponding exit values if they are
+available. Such sequencies can arise after SimplifyIndVars and
+LoopStrengthReduce passes. This pass should be run after LoopStrengthReduce.
+
 ``-loop-rotate``: Rotate Loops
 ------------------------------
 
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -167,6 +167,7 @@
 void initializeLoopRotatePass(PassRegistry&);
 void initializeLoopSimplifyPass(PassRegistry&);
 void initializeLoopStrengthReducePass(PassRegistry&);
+void initializeLoopExitValuesPass(PassRegistry&);
 void initializeGlobalMergePass(PassRegistry&);
 void initializeLoopRerollPass(PassRegistry&);
 void initializeLoopUnrollPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -106,6 +106,7 @@
       (void) llvm::createLoopInterchangePass();
       (void) llvm::createLoopSimplifyPass();
       (void) llvm::createLoopStrengthReducePass();
+      (void) llvm::createLoopExitValuesPass();
       (void) llvm::createLoopRerollPass();
       (void) llvm::createLoopUnrollPass();
       (void) llvm::createLoopUnswitchPass();
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -156,6 +156,15 @@
 
 //===----------------------------------------------------------------------===//
 //
+// LoopExitValues - This pass finds loop exit values reevaluated after the
+// loop execution and replaces them by the corresponding values if they are
+// available. Such sequencies often arise after SimplifyIndVars +
+// LoopStrengthReduce. This pass should be run after LoopStrengthReduce.
+//
+Pass *createLoopExitValuesPass();
+
+//===----------------------------------------------------------------------===//
+//
 // GlobalMerge - This pass merges internal (by default) globals into structs
 // to enable reuse of a base pointer by indexed addressing modes.
 // It can also be configured to focus on size optimizations only.
Index: lib/CodeGen/Passes.cpp
===================================================================
--- lib/CodeGen/Passes.cpp
+++ lib/CodeGen/Passes.cpp
@@ -69,6 +69,8 @@
     cl::desc("Disable Machine Sinking"));
 static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
     cl::desc("Disable Loop Strength Reduction Pass"));
+static cl::opt<bool> DisableLoopExitVals("disable-loop-exit-vals", cl::Hidden,
+    cl::desc("Disable LoopExitValues pass"));
 static cl::opt<bool> DisableConstantHoisting("disable-constant-hoisting",
     cl::Hidden, cl::desc("Disable ConstantHoisting"));
 static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
@@ -83,6 +85,8 @@
     cl::init(false));
 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
     cl::desc("Print LLVM IR produced by the loop-reduce pass"));
+static cl::opt<bool> PrintLoopExitValues("print-loop-exit-values-output", cl::Hidden,
+    cl::desc("Print LLVM IR produced by the loop exit value optimization pass"));
 static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
     cl::desc("Print LLVM IR input to isel pass"));
 static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
@@ -401,6 +405,12 @@
       addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
+  if (getOptLevel() != CodeGenOpt::None && !DisableLoopExitVals) {
+    addPass(createLoopExitValuesPass());
+	if (PrintLoopExitValues)
+	  addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after Loop Exit Values Optimization **\n"));
+  }
+
   // Run GC lowering passes for builtin collectors
   // TODO: add a pass insertion point here
   addPass(createGCLoweringPass());
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -31,6 +31,7 @@
   MemCpyOptimizer.cpp
   MergedLoadStoreMotion.cpp
   NaryReassociate.cpp
+  LoopExitValues.cpp
   PartiallyInlineLibCalls.cpp
   PlaceSafepoints.cpp
   Reassociate.cpp
Index: lib/Transforms/Scalar/LoopExitValues.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/LoopExitValues.cpp
@@ -0,0 +1,191 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This optimization finds loop exit values reevaluated after the loop
+// execution and replaces them by the corresponding exit values if they are
+// available. Such sequences can arise after SimplifyIndVars +
+// LoopStrengthReduce passes. This pass should be run after LoopStrengthReduce.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-exit-values"
+
+/// If any of the instructions is the specified set are trivially dead, delete
+/// them and see if this makes any of their operands subsequently dead.
+/// TODO: This function is identical to one in LoopStrengthReduce.  Move these
+/// TODO: to one common location.
+static bool
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
+  bool Changed = false;
+
+  while (!DeadInsts.empty()) {
+    Value *V = DeadInsts.pop_back_val();
+    Instruction *I = dyn_cast_or_null<Instruction>(V);
+
+    if (!I || !isInstructionTriviallyDead(I))
+      continue;
+
+    for (Use &O : I->operands())
+      if (Instruction *U = dyn_cast<Instruction>(O)) {
+        O = nullptr;
+        if (U->use_empty())
+          DeadInsts.emplace_back(U);
+      }
+
+    I->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+static void collectNestedLoops(Loop *L, std::vector<Loop *> &Loops) {
+  Loops.push_back(L);
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    collectNestedLoops(*I, Loops);
+}
+
+static void collectLoops(LoopInfo &LI, std::vector<Loop *> &Loops) {
+  for (LoopInfo::iterator I = LI.begin(), E = LI.end(); I != E; ++I)
+    collectNestedLoops(*I, Loops);
+}
+
+namespace {
+
+class LoopExitValues : public FunctionPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopExitValues();
+
+private:
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // anonymous
+
+char LoopExitValues::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopExitValues, "loop-exit-values",
+                      "Optimize loop exit values", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopExitValues, "loop-exit-values",
+                    "Optimize loop exit values", false, false)
+
+Pass *llvm::createLoopExitValuesPass() {
+  return new LoopExitValues();
+}
+
+LoopExitValues::LoopExitValues()
+        : FunctionPass(ID) {
+  initializeLoopExitValuesPass(*PassRegistry::getPassRegistry());
+}
+
+bool LoopExitValues::runOnFunction(Function &F) {
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo &LI =  getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  std::vector<Loop *> Loops;
+  collectLoops(LI, Loops);
+
+  // Collect exit values available after each of the loops
+  typedef DenseMap<const SCEV *, std::vector<Instruction *> >SCEVToInstMap;
+  SCEVToInstMap SCEVToInstr;
+  for (Loop* L : Loops) {
+    for (BasicBlock* BB : L->getBlocks()) {
+      for (BasicBlock::iterator I : *BB) {
+        if (!SE.isSCEVable(I->getType()))
+          continue;
+
+        const SCEV *ExitValue = SE.getSCEVAtScope(SE.getSCEV(&*I),
+                                                  L->getParentLoop());
+        if (!SE.isLoopInvariant(ExitValue, L) ||
+            ExitValue->getSCEVType() == scUnknown ||
+            ExitValue->getSCEVType() == scCouldNotCompute)
+          continue;
+
+        SCEVToInstr[ExitValue].push_back(&*I);
+      }
+    }
+  }
+  if (SCEVToInstr.empty())
+    return false;
+
+  // Find similar values
+  bool Changed = false;
+  SmallVector<WeakVH, 16> DeadInsts;
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+    if (!SE.isSCEVable(I->getType()))
+      continue;
+
+    // Since SCEVs are unique, we may use simple pointer comparison here.
+    const SCEV *SV = SE.getSCEV(&*I);
+    SCEVToInstMap::const_iterator iter = SCEVToInstr.find(SV);
+    if (iter == SCEVToInstr.end())
+      continue;
+
+    for (Instruction *Cand : iter->second) {
+      if (not DT.properlyDominates(Cand->getParent(), I->getParent()))
+        continue;
+
+      // Try to replace I with the other instruction.
+      Instruction *NewInstr = Cand;
+      if (NewInstr->getType() != I->getType()) {
+        // If either instruction is a bitcast, that's ok, just insert a
+        // bitcast.  Otherwise, some strange stuff detected, so do not
+        // optimize.
+        if (isa<BitCastInst>(NewInstr) || isa<BitCastInst>(&*I))
+          NewInstr =
+                  new BitCastInst(NewInstr, I->getType(), I->getName(), &*I);
+        else
+          NewInstr = nullptr;
+      }
+
+      if (NewInstr == nullptr)
+        continue;
+
+      DEBUG(dbgs() << "LoopExitValues: Replacing instruction: ");
+      DEBUG(I->dump());
+      DEBUG(dbgs() << "                    with SCEV equivalent: ");
+      DEBUG(Cand->dump());
+      Changed = true;
+      I->replaceAllUsesWith(NewInstr);
+      DeadInsts.push_back(&*I);
+      break;
+    }
+  }
+
+  DeleteTriviallyDeadInstructions(DeadInsts);
+  return Changed;
+}
+
+void LoopExitValues::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+  AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.addPreserved<ScalarEvolutionWrapperPass>();
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.addPreserved<LoopInfoWrapperPass>();
+}
+
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -53,6 +53,7 @@
   initializeLoopInterchangePass(Registry);
   initializeLoopRotatePass(Registry);
   initializeLoopStrengthReducePass(Registry);
+  initializeLoopExitValuesPass(Registry);
   initializeLoopRerollPass(Registry);
   initializeLoopUnrollPass(Registry);
   initializeLoopUnswitchPass(Registry);
Index: test/CodeGen/Generic/stop-after.ll
===================================================================
--- test/CodeGen/Generic/stop-after.ll
+++ test/CodeGen/Generic/stop-after.ll
@@ -6,6 +6,10 @@
 ; STOP-NEXT: Machine Function Analysis
 ; STOP-NEXT: MIR Printing Pass
 
-; START: -machine-branch-prob -gc-lowering
+; START: -machine-branch-prob -domtree -loops -scalar-evolution -loop-exit-values
 ; START: FunctionPass Manager
-; START-NEXT: Lower Garbage Collection Instructions
+; START-NEXT: Dominator Tree Construction
+; START-NEXT: Natural Loop Information
+; START-NEXT: Scalar Evolution Analysis
+; START-NEXT: Optimize loop exit values
+
Index: test/CodeGen/X86/lsr-loop-exit-cond.ll
===================================================================
--- test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK-LABEL: t:
-; CHECK: movl (%r9,%rax,4), %e{{..}}
+; CHECK: movl (%r9,%{{.+}},4), %e{{..}}
 ; CHECK-NEXT: decq
 ; CHECK-NEXT: jne
 
Index: test/CodeGen/X86/pr11202.ll
===================================================================
--- test/CodeGen/X86/pr11202.ll
+++ test/CodeGen/X86/pr11202.ll
@@ -15,5 +15,5 @@
   br label %l1
 }
 
-; CHECK: .Ltmp0:                                 # Address of block that was removed by CodeGen
+; CHECK: .Ltmp0:
 ; CHECK: .quad	.Ltmp0
Index: test/Transforms/LoopExitValues/matrix_mul.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopExitValues/matrix_mul.ll
@@ -0,0 +1,51 @@
+; RUN: llc %s -print-lsr-output -print-loop-exit-values-output -o /dev/null 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-unknown-unknown"
+
+define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture readonly %Src, i32 %Val) {
+entry:
+  %cmp.25 = icmp eq i32 %Size, 0
+  br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph
+
+for.body.4.lr.ph:                                 ; preds = %entry, %for.cond.cleanup.3
+  %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, %entry ]
+  %mul = mul i32 %Outer.026, %Size
+  br label %for.body.4
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.3, %entry
+  ret void
+
+for.cond.cleanup.3:                               ; preds = %for.body.4
+  %inc10 = add nuw nsw i32 %Outer.026, 1
+  %exitcond27 = icmp eq i32 %inc10, %Size
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body.4.lr.ph
+
+for.body.4:                                       ; preds = %for.body.4, %for.body.4.lr.ph
+  %Inner.024 = phi i32 [ 0, %for.body.4.lr.ph ], [ %inc, %for.body.4 ]
+  %add = add i32 %Inner.024, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %Src, i32 %add
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !1
+  %mul5 = mul i32 %0, %Val
+  %arrayidx8 = getelementptr inbounds i32, i32* %Dst, i32 %add
+  store i32 %mul5, i32* %arrayidx8, align 4, !tbaa !1
+  %inc = add nuw nsw i32 %Inner.024, 1
+  %exitcond = icmp eq i32 %inc, %Size
+  br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4
+}
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+
+
+; Verify LSR produced the expected bitcasts
+; CHECK: Code after LSR
+; CHECK: [ %2, {{.+}} ]
+; CHECK-NEXT: [ %1, {{.+}} ]
+
+; Verify Loop Exit Values pass replaced these with available GEPs
+; CHECK: Code after Loop Exit Values
+; CHECK: [ %scevgep9, {{.+}} ]
+; CHECK-NEXT: [ %scevgep4, {{.+}} ]
Index: test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -210,13 +210,14 @@
 ; A9: %.lr.ph
 ; A9-NOT: lsl.w
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: vst1.8 {{.*}} [r{{[0-9]+}}]!
+; A9: vst1.8 {{.*}} [{{lr}}]!
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: add.w r
+; A9: add.w lr
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: add.w r
+; A9: add.w lr
 ; A9-NOT: {{ldr|str|adds|add r}}
 ; A9-NOT: add.w r
+; A9-NOT: add.w lr
 ; A9: bne
 define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {
   %1 = icmp sgt i32 %limit, 0
Index: tools/llc/llc.cpp
===================================================================
--- tools/llc/llc.cpp
+++ tools/llc/llc.cpp
@@ -190,6 +190,7 @@
   initializeCore(*Registry);
   initializeCodeGen(*Registry);
   initializeLoopStrengthReducePass(*Registry);
+  initializeLoopExitValuesPass(*Registry);
   initializeLowerIntrinsicsPass(*Registry);
   initializeUnreachableBlockElimPass(*Registry);