Index: llvm/lib/Transforms/Scalar/LoopFuse.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -68,6 +68,12 @@
 #include "llvm/Transforms/Utils/CodeMoverUtils.h"
 #include "llvm/Transforms/Utils/LoopPeel.h"
 
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "loop-fusion"
@@ -101,6 +107,8 @@
 STATISTIC(NotRotated, "Candidate is not rotated");
 STATISTIC(OnlySecondCandidateIsGuarded,
           "The second candidate is guarded while the first one is not");
+STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
+STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
 
 enum FusionDependenceAnalysisChoice {
   FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -549,16 +557,16 @@
   PostDominatorTree &PDT;
   OptimizationRemarkEmitter &ORE;
   AssumptionCache &AC;
-
   const TargetTransformInfo &TTI;
+  AAResults &AA;
 
 public:
   LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
             ScalarEvolution &SE, PostDominatorTree &PDT,
             OptimizationRemarkEmitter &ORE, const DataLayout &DL,
-            AssumptionCache &AC, const TargetTransformInfo &TTI)
+            AssumptionCache &AC, const TargetTransformInfo &TTI, AAResults &AA)
       : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
-        DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
+        DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI), AA(AA) {}
 
   /// This is the main entry point for loop fusion. It will traverse the
   /// specified function and collect candidate loops to fuse, starting at the
@@ -914,16 +922,6 @@
             continue;
           }
 
-          if (!isSafeToMoveBefore(*FC1->Preheader,
-                                  *FC0->Preheader->getTerminator(), DT, &PDT,
-                                  &DI)) {
-            LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
-                                 "instructions in preheader. Not fusing.\n");
-            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
-                                                       NonEmptyPreheader);
-            continue;
-          }
-
           if (FC0->GuardBranch) {
             assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
 
@@ -959,6 +957,32 @@
             continue;
           }
 
+          // If the second loop has instructions in the pre-header, attempt to
+          // hoist them up to the first loop's pre-header or sink them into the
+          // body of the second loop.
+          if (!isEmptyPreheader(*FC1)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
+                                 "preheader.\n");
+            // At this point, this is the last remaining legality check.
+            // Which means if we can make this pre-header empty, we can fuse
+            // these loops
+            SmallVector<Instruction *, 4> SafeToHoist;
+            SmallVector<Instruction *, 4> SafeToSink;
+            
+            // If it is not safe to hoist/sink all instructions in the
+            // pre-header, we cannot fuse these loops.
+            if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink)) { 
+              LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
+                                   "Fusion Candidate Pre-header.\n"
+                                << "Not Fusing.\n");
+              reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                         NonEmptyPreheader);
+              continue;
+            }
+            // Execute the hoist/sink operations on preheader instructions
+            movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink);
+          }
+
           bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
           LLVM_DEBUG(dbgs()
                      << "\tFusion appears to be "
@@ -1022,6 +1046,74 @@
     return Fused;
   }
 
+  /// Collect instructions in the \p FC1 Preheader that can be hoisted
+  /// to the \p FC0 Preheader or sunk into the \p FC1 Body
+  bool collectMovablePreheaderInsts(
+      const FusionCandidate &FC0, const FusionCandidate &FC1,
+      SmallVector<Instruction *, 4> &SafeToHoist,
+      SmallVector<Instruction *, 4> &SafeToSink) const {
+    BasicBlock *FC1Preheader = FC1.Preheader;
+    for (auto &I : *FC1Preheader) {
+      // Can't move a branch
+      if (&I == FC1Preheader->getTerminator())
+        continue;
+      // If the instruction has side-effects, give up.
+      // TODO: The case of mayReadFromMemory we can handle but requires
+      // additional work with a dependence analysis so for now we give
+      // up on memory reads.
+      if (I.mayWriteToMemory() || I.mayThrow() || I.mayReadFromMemory()) {
+        LLVM_DEBUG(dbgs() << "Inst: " << I << " has side-effects.\n");
+        return false;
+      }
+      // Compute alias set needed for canSinkOrHoinstInst
+      AliasSetTracker CurAST0(AA);
+      for (BasicBlock *BB : FC0.L->blocks())
+        CurAST0.add(*BB);
+
+      AliasSetTracker CurAST1(AA);
+      for (BasicBlock *BB : FC1.L->blocks())
+        CurAST1.add(*BB);
+      CurAST1.add(*FC1Preheader);
+      // Default behaviour of this routine as defined in LICM decides
+      // to not hoist or sink debug intrinsics because it is not useful.
+      // For us it is useful so we make a special case here, instead of
+      // changing LICM code which would result in more side-effects.
+      if (!isa<DbgInfoIntrinsic>(I) &&
+          !canSinkOrHoistInst(I, &AA, &DT, FC0.L, &CurAST0, nullptr, true) &&
+          !canSinkOrHoistInst(I, &AA, &DT, FC1.L, &CurAST1, nullptr, true)) {
+        LLVM_DEBUG(dbgs() << "Inst: " << I << " cannot be sunk or hoisted.\n");
+        return false;
+      }
+      // First check if can be hoisted
+      // If the operands of this instruction dominate the FC0 Preheader
+      // target block, then it is safe to move them to the end of the FC0
+      BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor();
+      assert(FC0PreheaderTarget &&
+             "Expected single successor for loop preheader.");
+      bool OperandsDomFC0Body = true;
+      for (auto &Op : I.operands())
+        if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+          if (!DT.dominates(OpInst, FC0PreheaderTarget)) {
+            OperandsDomFC0Body = false;
+            continue;
+          }
+
+      if (OperandsDomFC0Body)
+        SafeToHoist.push_back(&I);
+      else {
+        // If I is not used in FC1.L, it can be sunk
+        for (auto U : I.users()) {
+          if (Instruction *UI = dyn_cast<Instruction>(U))
+            if (!DT.dominates(FC1.ExitBlock, UI->getParent()))
+              return false; // Cannot sink or hoist; might as well stop here
+        }
+        // second loop does not contain users of I, safe to sink
+        SafeToSink.push_back(&I);
+      }
+    }
+    return true;
+  }
+
   /// Rewrite all additive recurrences in a SCEV to use a new loop.
   class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
   public:
@@ -1235,6 +1327,39 @@
       return FC0.ExitBlock == FC1.getEntryBlock();
   }
 
+  bool isEmptyPreheader(const FusionCandidate &FC) const {
+    return FC.Preheader->size() == 1;
+  }
+
+  /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader
+  /// and sink others into the body of \p FC1.
+  void movePreheaderInsts(const FusionCandidate &FC0,
+                          const FusionCandidate &FC1,
+                          SmallVector<Instruction *, 4> &HoistInsts,
+                          SmallVector<Instruction *, 4> &SinkInsts) const {
+    LLVM_DEBUG(if (VerboseFusionDebugging) {
+      if (!HoistInsts.empty())
+        dbgs() << "Hoisting: \n";
+      for (auto I : HoistInsts)
+        dbgs() << *I << "\n";
+      if (!SinkInsts.empty())
+        dbgs() << "Sinking: \n";
+      for (auto I : SinkInsts)
+        dbgs() << *I << "\n";
+    });
+    for (auto I : HoistInsts) {
+      assert(I->getParent() == FC1.Preheader);
+      NumHoistedInsts++;
+      I->moveBefore(FC0.Preheader->getTerminator());
+    }
+    // insert instructions in reverse order to maintain dominance relationship
+    for (auto I : reverse(SinkInsts)) {
+      assert(I->getParent() == FC1.Preheader);
+      NumSunkInsts++;
+      I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt());
+    }
+  }
+
   /// Determine if two fusion candidates have identical guards
   ///
   /// This method will determine if two fusion candidates have the same guards.
@@ -1820,6 +1945,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequiredID(LoopSimplifyID);
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
@@ -1838,6 +1964,7 @@
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
+
     auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
@@ -1847,9 +1974,10 @@
     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     const TargetTransformInfo &TTI =
         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
     const DataLayout &DL = F.getParent()->getDataLayout();
 
-    LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
+    LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI, AA);
     return LF.fuseLoops(F);
   }
 };
@@ -1864,9 +1992,10 @@
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
   const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto &AA = AM.getResult<AAManager>(F);
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
+  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI, AA);
   bool Changed = LF.fuseLoops(F);
   if (!Changed)
     return PreservedAnalyses::all();
Index: llvm/test/Transforms/LoopFusion/hoist_preheader.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopFusion/hoist_preheader.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+
+define void @hoist_preheader(i32 %N) {
+
+; CHECK:pre1:
+; CHECK-NEXT:  %hoistme = add i32 1, %N
+; CHECK-NEXT:  br label %body1
+pre1:
+  br label %body1
+
+; CHECK: body1:
+; CHECK-NOT:  %hoistme = add i32 1, %N
+body1:  ; preds = %pre1, %body1
+  %i = phi i32 [%i_next, %body1], [0, %pre1]
+  %i_next = add i32 1, %i
+  %cond = icmp ne i32 %i, %N
+  br i1 %cond, label %body1, label %pre2
+
+pre2:
+  %hoistme = add i32 1, %N
+  br label %body2
+
+body2:  ; preds = %pre2, %body2
+  %i2 = phi i32 [%i_next2, %body2], [0, %pre2]
+  %i_next2 = add i32 1, %i2
+  %cond2 = icmp ne i32 %i2, %N
+  br i1 %cond2, label %body2, label %exit
+
+exit:
+  ret void
+}
Index: llvm/test/Transforms/LoopFusion/no_sink_hoist.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopFusion/no_sink_hoist.ll
@@ -0,0 +1,40 @@
+;;;; RUN: opt -S -loop-fusion < %s | FileCheck %s
+; RUN: opt -S -loop-fusion -debug -o %t1 < %s &> %t2
+; RUN: cat %t1 | FileCheck %s
+; RUN: grep "Could not hoist/sink all instructions" %t2
+
+define void @sink_preheader(i32 %N) {
+; CHECK:pre1:
+; CHECK-NEXT:  br label %body1
+pre1:
+  br label %body1
+
+; CHECK:body1: 
+; CHECK-NOT:  %stay =
+body1:  ; preds = %pre1, %body1
+  %i = phi i32 [%i_next, %body1], [0, %pre1]
+  %i_next = add i32 1, %i
+  %cond = icmp ne i32 %i, %N
+  %barrier = add i32 1, %N
+  br i1 %cond, label %body1, label %pre2
+
+; CHECK:pre2:
+; CHECK: %stay =
+pre2:
+  %stay = add i32 1, %barrier
+  br label %body2
+
+; CHECK: body2:
+; CHECK-NOT: %stay =
+body2:  ; preds = %pre2, %body2
+  %i2 = phi i32 [%i_next2, %body2], [0, %pre2]
+  %i_next2 = add i32 1, %i2
+  %cond2 = icmp ne i32 %i2, %N
+  %barrier2 = add i32 1, %stay
+  br i1 %cond2, label %body2, label %exit
+
+; CHECK: exit:
+; CHECK-NOT: %stay =
+exit:
+  ret void
+}
Index: llvm/test/Transforms/LoopFusion/simple.ll
===================================================================
--- llvm/test/Transforms/LoopFusion/simple.ll
+++ llvm/test/Transforms/LoopFusion/simple.ll
@@ -487,8 +487,9 @@
 }
 
 ; Test that `%add` cannot be moved to basic block entry, as it uses %i, which
-; defined after basic block entry. And the two loops for.first and for.second
-; are not fused.
+; defined after basic block entry. It also cannot be moved to for.second.exit 
+; since it is used in for.second. Check also that the two loops for.first and
+; for.second are not fused.
 
 define i64 @unsafe_preheader(i32* %A, i64 %x) {
 ; CHECK-LABEL: @unsafe_preheader(
@@ -505,7 +506,7 @@
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[X:%.*]], [[I]]
 ; CHECK-NEXT:    br label [[FOR_SECOND:%.*]]
 ; CHECK:       for.second:
-; CHECK-NEXT:    [[J:%.*]] = phi i64 [ 0, [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[ADD]], [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ]
 ; CHECK-NEXT:    [[AJ:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[J]]
 ; CHECK-NEXT:    store i32 2, i32* [[AJ]], align 4
 ; CHECK-NEXT:    [[INC_J]] = add nsw i64 [[J]], 1
@@ -530,7 +531,7 @@
   br label %for.second
 
 for.second:
-  %j = phi i64 [ 0, %for.first.exit ], [ %inc.j, %for.second ]
+  %j = phi i64 [ %add, %for.first.exit ], [ %inc.j, %for.second ]
   %Aj = getelementptr inbounds i32, i32* %A, i64 %j
   store i32 2, i32* %Aj, align 4
   %inc.j = add nsw i64 %j, 1
Index: llvm/test/Transforms/LoopFusion/sink_preheader.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopFusion/sink_preheader.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+
+define void @sink_preheader(i32 %N) {
+
+; CHECK:pre1:
+; CHECK-NEXT:  br label %body1
+pre1:
+  br label %body1
+
+; CHECK:body1: 
+; CHECK-NOT:  %sinkme
+body1:  ; preds = %pre1, %body1
+  %i = phi i32 [%i_next, %body1], [0, %pre1]
+  %i_next = add i32 1, %i
+  %cond = icmp ne i32 %i, %N
+  %barrier = add i32 1, %N
+  br i1 %cond, label %body1, label %pre2
+
+pre2:
+  %sinkme = add i32 1, %barrier
+  %sinkme2 = add i32 1, %barrier
+  br label %body2
+
+body2:  ; preds = %pre2, %body2
+  %i2 = phi i32 [%i_next2, %body2], [0, %pre2]
+  %i_next2 = add i32 1, %i2
+  %cond2 = icmp ne i32 %i2, %N
+  br i1 %cond2, label %body2, label %exit
+
+; CHECK:exit:
+; CHECK-NEXT:  %sinkme = add i32 1, %barrier
+; CHECK-NEXT:  %sinkme2 = add i32 1, %barrier
+; CHECK-NEXT:  ret void
+exit:
+  ret void
+
+
+}