Index: lib/Transforms/Scalar/LICM.cpp
===================================================================
--- lib/Transforms/Scalar/LICM.cpp
+++ lib/Transforms/Scalar/LICM.cpp
@@ -250,27 +250,47 @@
   // Now that all loop invariants have been removed from the loop, promote any
   // memory references to scalars that we can.
-  if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) {
+  // Don't sink stores from loops without dedicated block exits. Exits
+  // containing indirect branches are not transformed by loop simplify,
+  // so make sure we catch that. An additional load may be generated in the
+  // preheader for the SSA updater, so also avoid sinking when no preheader
+  // is available.
+  if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+    // Figure out the loop exits and their insertion points.
     SmallVector<BasicBlock *, 8> ExitBlocks;
-    SmallVector<Instruction *, 8> InsertPts;
-    PredIteratorCache PIC;
+    L->getUniqueExitBlocks(ExitBlocks);
 
-    // Loop over all of the alias sets in the tracker object.
-    for (AliasSet &AS : *CurAST)
-      Changed |= promoteLoopAccessesToScalars(
-          AS, ExitBlocks, InsertPts, PIC, LI, DT, TLI, L, CurAST, &SafetyInfo);
-
-    // Once we have promoted values across the loop body we have to recursively
-    // reform LCSSA as any nested loop may now have values defined within the
-    // loop used in the outer loop.
-    // FIXME: This is really heavy handed. It would be a bit better to use an
-    // SSAUpdater strategy during promotion that was LCSSA aware and reformed
-    // it as it went.
-    if (Changed) {
-      formLCSSARecursively(*L, *DT, LI, SE);
+    // We can't insert into a catchswitch.
+    bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+      return isa<CatchSwitchInst>(Exit->getTerminator());
+    });
+
+    if (!HasCatchSwitch) {
+      SmallVector<Instruction *, 8> InsertPts;
+      InsertPts.reserve(ExitBlocks.size());
+      for (BasicBlock *ExitBlock : ExitBlocks)
+        InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+
+      PredIteratorCache PIC;
+
+      // Loop over all of the alias sets in the tracker object.
+      for (AliasSet &AS : *CurAST)
+        Changed |=
+            promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT,
+                                         TLI, L, CurAST, &SafetyInfo);
     }
   }
+  // Once we have promoted values across the loop body we have to recursively
+  // reform LCSSA as any nested loop may now have values defined within the
+  // loop used in the outer loop.
+  // FIXME: This is really heavy handed. It would be a bit better to use an
+  // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+  // it as it went.
+  if (Changed)
+    formLCSSARecursively(*L, *DT, LI, SE);
+
   // Check that neither this loop nor its parent have had LCSSA broken. LICM is
   // specifically moving instructions across the loop boundary and so it is
   // especially in need of sanity checking here.
@@ -887,17 +907,23 @@
   // path which did not originally have one.
   //
   // It is safe to promote P if all uses are direct load/stores and if at
-  // least one is guaranteed to be executed.
-  bool GuaranteedToExecute = false;
+  // least one store is guaranteed to be executed.
+  bool PromotionIsLegal = false;
 
   // It is also safe to promote P if we can prove that speculating a load into
   // the preheader is safe (i.e. proving dereferenceability on all
-  // paths through the loop), and that the memory can be proven thread local
-  // (so that the memory model requirement doesn't apply.) We first establish
-  // the former, and then run a capture analysis below to establish the later.
-  // We can use any access within the alias set to prove dereferenceability
-  // since they're all must alias.
+  // paths through the loop), and if we are sure we don't introduce memory
+  // model issues by sinking the store.
+  // A store is safe to sink if:
+  // a) We can prove the memory it stores to is thread local, in which case
+  //    the memory model requirement does not apply.
+  // b) It was already guaranteed to execute at least once (see above).
+  // c) It was not guaranteed to execute - but only because we may exit the loop
+  //    through an exception. This guarantees that dynamic executions which would
+  //    not execute the original store will not execute the exit block, so no
+  //    new store is introduced.
   bool CanSpeculateLoad = false;
+  bool StoreDominatesExits = false;
 
   SmallVector<Instruction *, 64> LoopUses;
   SmallPtrSet<Value *, 4> PointerMustAliases;
@@ -906,15 +932,9 @@
   // us to prove better alignment.
   unsigned Alignment = 1;
   AAMDNodes AATags;
-  bool HasDedicatedExits = CurLoop->hasDedicatedExits();
 
-  // Don't sink stores from loops without dedicated block exits. Exits
-  // containing indirect branches are not transformed by loop simplify,
-  // make sure we catch that. An additional load may be generated in the
-  // preheader for SSA updater, so also avoid sinking when no preheader
-  // is available.
-  if (!HasDedicatedExits || !Preheader)
-    return false;
+  assert(CurLoop->hasDedicatedExits() && Preheader &&
+         "Expected preheader and dedicated loop exits.");
 
   const DataLayout &MDL = Preheader->getModule()->getDataLayout();
 
@@ -955,7 +975,7 @@
         if (!Load->isSimple())
           return Changed;
 
-        if (!GuaranteedToExecute && !CanSpeculateLoad)
+        if (!PromotionIsLegal && !CanSpeculateLoad)
           CanSpeculateLoad = isSafeToExecuteUnconditionally(
               *Load, DT, CurLoop, SafetyInfo, Preheader->getTerminator());
       } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
@@ -967,10 +987,6 @@
         if (!Store->isSimple())
           return Changed;
 
-        // Note that we only check GuaranteedToExecute inside the store case
-        // so that we do not introduce stores where they did not exist before
-        // (which would break the LLVM concurrency model).
-
         // If the alignment of this instruction allows us to specify a more
         // restrictive (and performant) alignment and if we are sure this
         // instruction will be executed, update the alignment.
@@ -979,18 +995,25 @@
        if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) {
          if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
-           GuaranteedToExecute = true;
+           PromotionIsLegal = true;
            Alignment = InstAlignment;
          }
-       } else if (!GuaranteedToExecute) {
-         GuaranteedToExecute =
+       } else if (!PromotionIsLegal) {
+         PromotionIsLegal =
              isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo);
        }
 
-        if (!GuaranteedToExecute && !CanSpeculateLoad) {
-          CanSpeculateLoad = isDereferenceableAndAlignedPointer(
-              Store->getPointerOperand(), Store->getAlignment(), MDL,
-              Preheader->getTerminator(), DT);
+        if (!PromotionIsLegal) {
+          if (!CanSpeculateLoad)
+            CanSpeculateLoad = isDereferenceableAndAlignedPointer(
+                Store->getPointerOperand(), Store->getAlignment(), MDL,
+                Preheader->getTerminator(), DT);
+          if (!StoreDominatesExits) {
+            StoreDominatesExits = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
+              return DT->dominates(Store->getParent(), Exit);
+            });
+          }
         }
       } else
         return Changed; // Not a load or store.
@@ -1008,33 +1031,20 @@
   }
 
   // Check legality per comment above. Otherwise, we can't promote.
-  bool PromotionIsLegal = GuaranteedToExecute;
   if (!PromotionIsLegal && CanSpeculateLoad) {
+    PromotionIsLegal = StoreDominatesExits;
     // If this is a thread local location, then we can insert stores along
     // paths which originally didn't have them without violating the memory
     // model.
-    Value *Object = GetUnderlyingObject(SomePtr, MDL);
-    PromotionIsLegal =
-        isAllocLikeFn(Object, TLI) && !PointerMayBeCaptured(Object, true, true);
+    if (!PromotionIsLegal) {
+      Value *Object = GetUnderlyingObject(SomePtr, MDL);
+      PromotionIsLegal = isAllocLikeFn(Object, TLI) &&
+                         !PointerMayBeCaptured(Object, true, true);
+    }
   }
   if (!PromotionIsLegal)
     return Changed;
 
-  // Figure out the loop exits and their insertion points, if this is the
-  // first promotion.
-  if (ExitBlocks.empty()) {
-    CurLoop->getUniqueExitBlocks(ExitBlocks);
-    InsertPts.clear();
-    InsertPts.reserve(ExitBlocks.size());
-    for (BasicBlock *ExitBlock : ExitBlocks)
-      InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
-  }
-
-  // Can't insert into a catchswitch.
-  for (BasicBlock *ExitBlock : ExitBlocks)
-    if (isa<CatchSwitchInst>(ExitBlock->getTerminator()))
-      return Changed;
-
   // Otherwise, this is safe to promote, lets do it!
   DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
                << '\n');
Index: test/Transforms/LICM/scalar_promote.ll
===================================================================
--- test/Transforms/LICM/scalar_promote.ll
+++ test/Transforms/LICM/scalar_promote.ll
@@ -186,6 +186,188 @@
 ; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %gi, align 4, !tbaa !0
 }
 
+declare i32 @opaque(i32)
+
+; We can promote even if opaque may throw.
+define i32 @test7() {
+; CHECK-LABEL: @test7(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %loop ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %loop ]
+  %x = load i32, i32* %local
+  %x2 = call i32 @opaque(i32 %x) ; Note this does not capture %local
+  store i32 %x2, i32* %local
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+; Make sure we don't promote if the store is really control-flow dependent.
+define i32 @test7bad() {
+; CHECK-LABEL: @test7bad(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: br label %loop
+; CHECK: if:
+; CHECK-NEXT: store i32 %x2, i32* %local
+; CHECK-NEXT: br label %else
+; CHECK: exit:
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %else ]
+  %x = load i32, i32* %local
+  %x2 = call i32 @opaque(i32 %x) ; Note this does not capture %local
+  %cmp = icmp eq i32 %x2, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  store i32 %x2, i32* %local
+  br label %else
+
+else:
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+; Even if neither the load nor the store is guaranteed to execute because
+; opaque() may throw, we can still promote - the load not being guaranteed
+; doesn't block us, because %local is always dereferenceable.
+define i32 @test8() {
+; CHECK-LABEL: @test8(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %loop ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %loop ]
+  %throwaway = call i32 @opaque(i32 %j)
+  %x = load i32, i32* %local
+  %x2 = call i32 @opaque(i32 %x)
+  store i32 %x2, i32* %local
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+; If the store is "guaranteed modulo exceptions", and the load depends on
+; control flow, we can only promote if the pointer is otherwise known to be
+; dereferenceable.
+define i32 @test9() {
+; CHECK-LABEL: @test9(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %else ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
entry:
+  %local = alloca i32
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %else ]
+  %j2 = call i32 @opaque(i32 %j)
+  %cmp = icmp eq i32 %j2, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %x = load i32, i32* %local
+  br label %else
+
+else:
+  %x2 = phi i32 [ 0, %loop ], [ %x, %if]
+  store i32 %x2, i32* %local
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+; Same as test9, but %notderef is not known to be dereferenceable, so we
+; must not promote.
+define i32 @test9bad(i32 %i) {
+; CHECK-LABEL: @test9bad(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: %notderef = getelementptr
+; CHECK-NEXT: br label %loop
+; CHECK: if:
+; CHECK-NEXT: load i32, i32* %notderef
+; CHECK-NEXT: br label %else
+; CHECK: exit:
+; CHECK-NEXT: %ret = load i32, i32* %notderef
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  %notderef = getelementptr i32, i32* %local, i32 %i
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %else ]
+  %j2 = call i32 @opaque(i32 %j)
+  %cmp = icmp eq i32 %j2, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %x = load i32, i32* %notderef
+  br label %else
+
+else:
+  %x2 = phi i32 [ 0, %loop ], [ %x, %if]
+  store i32 %x2, i32* %notderef
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %notderef
+  ret i32 %ret
+}
+
 !0 = !{!4, !4, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
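
Note (illustrative, not part of the patch): the HasCatchSwitch check, like the per-promotion check it replaces, exists because a loop exit block terminated by a catchswitch has no ordinary insertion point for a sunk store. A minimal IR sketch of that CFG shape, assuming an MSVC-style personality and a hypothetical @may_throw callee:

declare void @may_throw()
declare i32 @__CxxFrameHandler3(...)

define void @catchswitch_exit(i32* %p) personality i32 (...)* @__CxxFrameHandler3 {
entry:
  br label %loop

loop:                                   ; single-block loop
  store i32 0, i32* %p
  invoke void @may_throw()
          to label %loop unwind label %exit

; The exit block is an EH pad whose terminator is a catchswitch, so there is
; no usable first insertion point for a sunk store; with this patch, promotion
; is skipped up front for loops with such exits.
exit:
  %cs = catchswitch within none [label %handler] unwind to caller

handler:
  %cp = catchpad within %cs [i8* null, i32 64, i8* null]
  catchret from %cp to label %done

done:
  ret void
}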