Index: lib/Transforms/Scalar/GVN.cpp
===================================================================
--- lib/Transforms/Scalar/GVN.cpp
+++ lib/Transforms/Scalar/GVN.cpp
@@ -1514,6 +1514,119 @@
   }
 }
 
+/// Return true if 'Header' is actually the header of a valid loop. PredBlocks
+/// are either direct predecessors of the 'Header' BB or indirect predecessors
+/// through 'merge only' blocks.
+static bool isLoopHeader(BasicBlock *Header,
+                         SmallVectorImpl<BasicBlock *> &PredBlocks,
+                         DominatorTree &DT) {
+  // TODO: If LoopInfo is available, just use that.
+
+  // Without at least two predecessors, this can't be a loop.
+  if (PredBlocks.size() <= 1)
+    return false;
+
+  // We can't assume we're given a loop header. We need to ensure that all of
+  // the non-merging predecessors are either inside the loop (i.e. dominated
+  // by the header) or a single preheader. If we find any block which doesn't
+  // meet this requirement, we do not have a natural loop.
+  // Note: The term 'preheader' is being slightly abused here. We allow any
+  // predecessor which properly dominates the header, not just a dedicated
+  // preheader.
+  BasicBlock *LoopPred = nullptr;
+  for (BasicBlock *Pred : PredBlocks) {
+    if (DT.properlyDominates(Pred, Header)) {
+      assert(!LoopPred && "Can only have one preheader!");
+      LoopPred = Pred;
+    } else if (DT.dominates(Header, Pred)) {
+      // If this is a loop, this is either a latch, or a block which leads to
+      // a latch (i.e. via a merge only block).
+    } else {
+      // Not a natural loop at all.
+      return false;
+    }
+  }
+  // At this point, we've seen all the predecessors. Did we find *one* from
+  // outside the loop? If not, this is dead code and should just be deleted.
+  return LoopPred != nullptr;
+}
+
+/// Return true if this is a natural loop header and at least one 'real latch'
+/// (i.e. what was a latch before LoopSimplify introduced a merge point) is
+/// fully available. The idea here is that we want to perform a transform
+/// analogous to LICM, but also allow pushing an extra copy of the load into
+/// one of several paths which lead to the backedge. (That is, most paths
+/// through the loop should not involve a load, but the path which clobbers
+/// it will reload.)
+static bool isLICMLike(BasicBlock *Header,
+                       SmallVectorImpl<BasicBlock *> &PredBlocks,
+                       unsigned NumUnavailable,
+                       DominatorTree &DT) {
+  // We can only judge profitability if this is a loop.
+  if (!isLoopHeader(Header, PredBlocks, DT))
+    return false;
+
+  // Do we end up with at least one (possibly indirect) predecessor
+  // (i.e. hopefully a latch) where the load is fully available? The
+  // unavailable block could also be the loop predecessor, in which case this
+  // devolves to classic LICM.
+  return PredBlocks.size() > NumUnavailable;
+}
+
+/// Return true if this is a merge block.
+/// A 'merge block' is one that ends with an unconditional jump to a single
+/// basic block and contains no useful computation of its own. Such blocks
+/// arise frequently from loop simplify. JumpThreading removes such blocks,
+/// but may not have run immediately before GVN.
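+/// For reference, a typical merge only block as produced by loop simplify
+/// holds at most PHIs followed by an unconditional branch. Illustrative IR
+/// only, not taken from this patch's tests:
+///   merge:
+///     %x = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+///     br label %header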
+static bool isMergeOnlyBlock(BasicBlock *BB) {
+  if (&BB->getParent()->getEntryBlock() == BB)
+    return false;
+  if (BB->getTerminator()->getNumSuccessors() != 1)
+    return false;
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    switch (I->getOpcode()) {
+    default:
+      // Unsupported instruction.
+      return false;
+    case Instruction::PHI:
+    case Instruction::Br:
+      continue;
+    }
+  }
+  return true;
+}
+
+/// Given a 'merge block', return true if we think it is profitable to look
+/// through the block and treat its predecessors as possible insertion
+/// points. This is conservative in that it defaults to not splitting unless
+/// doing so clearly does no harm.
+static bool
+isProfitableToSplit(BasicBlock *Merge,
+                    DenseMap<BasicBlock *, char> &FullyAvailableBlocks) {
+  // If, looking at the merge block's predecessors, we find only a single
+  // unavailable predecessor, we can do no harm by looking through it, since
+  // that block would end up in our list of unavailable blocks anyway.
+  BasicBlock *UnavailablePred = nullptr;
+  for (pred_iterator PI = pred_begin(Merge), E = pred_end(Merge);
+       PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0))
+      continue;
+    if (UnavailablePred && UnavailablePred != Pred)
+      return false;
+    UnavailablePred = Pred;
+
+    if (Pred->getTerminator()->getNumSuccessors() != 1) {
+      // We'd need to split this critical edge and don't currently keep
+      // around the info to do so if we look through merge blocks.
+      return false;
+    }
+  }
+  return UnavailablePred != nullptr;
+}
+
 bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
                          UnavailBlkVect &UnavailableBlocks) {
   // Okay, we have *some* definitions of the value.  This means that the value
@@ -1561,13 +1674,63 @@
   for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
     FullyAvailableBlocks[UnavailableBlocks[i]] = false;
 
-  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  // When we get done, we want PredBlocks to contain all of the (possibly
+  // indirect) predecessors we considered for placement, including those which
+  // are fully available. We only exclude a block from this list if we've
+  // looked through it to consider *its* predecessors. This only happens for
+  // partially available 'merge' blocks.
+  SmallVector<BasicBlock *, 8> PredBlocks;
+  SmallSet<BasicBlock *, 8> Seen;
+  // Step 1 - Identify the unique direct predecessors of LoadBB.
   for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
        PI != E; ++PI) {
     BasicBlock *Pred = *PI;
+    if (!Seen.count(Pred)) {
+      PredBlocks.push_back(Pred);
+      Seen.insert(Pred);
+    }
+  }
+
+  // Step 2 - Identify actual insertion sites. This may involve looking
+  // through some of the merge blocks already in the list, so iterate by
+  // index; the push_backs below would invalidate iterators.
+  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  for (unsigned Idx = 0; Idx < PredBlocks.size(); ++Idx) {
+    BasicBlock *Pred = PredBlocks[Idx];
     if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
       continue;
     }
+
+    // If this is a merge block - one with a single successor and no
+    // computation of its own - which is partially available, we may want to
+    // look at its predecessors as possible load insertion points. Note that
+    // a merge block cannot affect whether a load is 'anticipated' along some
+    // path. This might be the single block we want to insert a load in; look
+    // through this block only if doing so can do no harm. This is enough to
+    // get most cases produced by loop simplify, which will trigger the
+    // LICM-like case below.
+    if (isMergeOnlyBlock(Pred) &&
+        isProfitableToSplit(Pred, FullyAvailableBlocks)) {
+      DEBUG(dbgs() << "MergeOnly: " << Pred->getName() << "\n");
+      // First, add each predecessor block which hasn't already been
+      // considered as a placement candidate. We happen to know that only
+      // one of those is unavailable, but we need to add each so we can judge
+      // profitability when we get done.
+      for (pred_iterator MPI = pred_begin(Pred), ME = pred_end(Pred);
+           MPI != ME; ++MPI) {
+        BasicBlock *MergePred = *MPI;
+        if (!Seen.count(MergePred)) {
+          Seen.insert(MergePred);
+          PredBlocks.push_back(MergePred);
+        }
+      }
+      // Delete the original merge block from the list, then revisit the
+      // element which now occupies this index.
+      PredBlocks.erase(PredBlocks.begin() + Idx);
+      --Idx;
+      continue;
+    }
 
     if (Pred->getTerminator()->getNumSuccessors() != 1) {
       if (isa<IndirectBrInst>(Pred->getTerminator())) {
@@ -1582,7 +1745,6 @@
               << Pred->getName() << "': " << *LI << '\n');
         return false;
       }
-
       CriticalEdgePred.push_back(Pred);
     } else {
       // Only add the predecessors that will not be split for now.
@@ -1590,16 +1752,30 @@
     }
   }
 
+  DEBUG({
+    for (auto *P : PredBlocks)
+      dbgs() << "pred block: " << P->getName() << "\n";
+  });
+
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block. With care, this works for 'indirect' predecessors due to
+  // merge blocks as well.
+
   // Decide whether PRE is profitable for this load.
   unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
   assert(NumUnavailablePreds != 0 &&
          "Fully available value should already be eliminated!");
 
-  // If this load is unavailable in multiple predecessors, reject it.
-  // FIXME: If we could restructure the CFG, we could make a common pred with
-  // all the preds that don't have an available LI and insert a new load into
-  // that one block.
-  if (NumUnavailablePreds != 1)
+  // We judge a load placement as profitable if:
+  // - We're moving a single load into some (potentially indirect)
+  //   predecessor, or
+  // - We can hoist a load out of a loop header and reload it along only some
+  //   paths through the loop. This is analogous to LICM, but additionally
+  //   allows a reload along one path in loops with interior control flow or
+  //   multiple latches.
+  if (NumUnavailablePreds != 1 &&
+      !isLICMLike(LoadBB, PredBlocks, NumUnavailablePreds, *DT))
     return false;
 
   // Split critical edges, and update the unavailable predecessors accordingly.
@@ -1671,6 +1847,8 @@
   for (const auto &PredLoad : PredLoads) {
     BasicBlock *UnavailablePred = PredLoad.first;
     Value *LoadPtr = PredLoad.second;
+    DEBUG(dbgs() << "Pred Load: " << UnavailablePred->getName() << " "
+                 << *LoadPtr << "\n");
 
     Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
                                         LI->getAlignment(),
@@ -1711,6 +1889,14 @@
   LoadDepVect Deps;
   MD->getNonLocalPointerDependency(LI, Deps);
 
+  DEBUG({
+    for (auto &Dep : Deps) {
+      if (Instruction *I = Dep.getResult().getInst())
+        dbgs() << *I << "\n";
+      else
+        dbgs() << "nullptr\n";
+      dbgs() << Dep.getBB()->getName() << "\n";
+    }
+  });
+
   // If we had to process more than one hundred blocks to find the
   // dependencies, this load isn't worth worrying about.  Optimizing
   // it will be too expensive.
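
Note on the TODO in isLoopHeader: if a preserved LoopInfo were available at
this point, the hand-rolled dominance checks could collapse to a direct
query. A minimal sketch, assuming a LoopInfo reference 'LInfo' were plumbed
through (it is not in this patch); this is slightly stricter than the check
above, since getLoopPreheader() requires a dedicated preheader:

  static bool isLoopHeaderViaLoopInfo(BasicBlock *Header, LoopInfo &LInfo) {
    // A natural loop rooted at Header, with a unique preheader standing in
    // for the 'single dominating predecessor' check in isLoopHeader.
    Loop *L = LInfo.getLoopFor(Header);
    return L && L->getHeader() == Header && L->getLoopPreheader() != nullptr;
  }
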
Index: test/Transforms/GVN/pre-loopsimplify.ll
===================================================================
--- /dev/null
+++ test/Transforms/GVN/pre-loopsimplify.ll
@@ -0,0 +1,113 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+target datalayout = "e"
+
+declare void @hold(i32) readonly
+declare void @clobber()
+
+; This is a classic LICM case.
+define i32 @test1(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test1
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br label %header
+}
+
+; This is classic LICM, but we need to look through a merge block
+; to see it.
+define i32 @test2(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test2
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  br label %merge
+
+bb2:
+  br label %merge
+
+merge:
+  br label %header
+}
+
+; In this case we have a load which is 'almost' loop invariant:
+; it's modified along one path, but not another. We want to
+; pull the load out of the loop and reload it in the clobbering
+; block.
+define i32 @test3(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test3
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre1 = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  br label %header
+
+bb2:
+; CHECK-LABEL: bb2
+; CHECK: call void @clobber()
+; CHECK-NEXT: %v1.pre = load i32* %p
+; CHECK-NEXT: br label %header
+  call void @clobber()
+  br label %header
+}
+
+; Same as above, but we need to look through a merge block
+; to see it. This is the form that loop-simplify produces
+; and is thus the most important test case in this file.
+define i32 @test4(i1 %cnd, i32* %p) {
+; CHECK-LABEL: @test4
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: %v1.pre = load i32* %p
+  br label %header
+
+header:
+; CHECK-LABEL: header
+; CHECK-NOT: load i32* %p
+  %v1 = load i32* %p
+  call void @hold(i32 %v1)
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  br label %merge
+
+bb2:
+; CHECK-LABEL: bb2
+; CHECK: call void @clobber()
+; CHECK-NEXT: %v1.pre1 = load i32* %p
+; CHECK-NEXT: br label %merge
+  call void @clobber()
+  br label %merge
+
+merge:
+  br label %header
+}
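
For completeness, a sketch of a case the profitability logic should reject
(illustrative only, not part of the patch's test file): when the load is
clobbered along every path to the backedge, no latch is fully available,
PredBlocks.size() equals NumUnavailable, and isLICMLike returns false.

  define i32 @test_neg(i32* %p) {
  entry:
    br label %header

  header:
    %v1 = load i32* %p
    call void @hold(i32 %v1)
    ; The only path back to the header clobbers the load, so there is no
    ; fully available latch to pair with a hoisted copy.
    call void @clobber()
    br label %header
  }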