diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -0,0 +1,40 @@
+//===- llvm/Transforms/Utils/LoopPeel.h ----- Peeling utilities -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some loop peeling utilities. It does not define any
+// actual pass or policy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_LOOPPEEL_H
+#define LLVM_TRANSFORMS_UTILS_LOOPPEEL_H
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+bool canPeel(Loop *L);
+
+bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
+              DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
+
+TargetTransformInfo::PeelingPreferences
+gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
+                         const TargetTransformInfo &TTI,
+                         Optional<bool> UserAllowPeeling,
+                         Optional<bool> UserAllowProfileBasedPeeling,
+                         bool UnrollingSpecficValues = false);
+
+void computePeelCount(Loop *L, unsigned LoopSize,
+                      TargetTransformInfo::PeelingPreferences &PP,
+                      unsigned &TripCount, ScalarEvolution &SE,
+                      unsigned Threshold = UINT_MAX);
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_LOOPPEEL_H
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -92,16 +92,6 @@
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     Loop **ResultLoop = nullptr);
 
-void computePeelCount(Loop *L, unsigned LoopSize,
-                      TargetTransformInfo::UnrollingPreferences &UP,
-                      TargetTransformInfo::PeelingPreferences &PP,
-                      unsigned &TripCount, ScalarEvolution &SE);
-
-bool canPeel(Loop *L);
-
-bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
-              DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
-
 LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                                   unsigned TripMultiple, bool UnrollRemainder,
                                   LoopInfo *LI, ScalarEvolution *SE,
@@ -121,7 +111,6 @@
                         unsigned &TripMultiple, unsigned LoopSize,
                         TargetTransformInfo::UnrollingPreferences &UP,
                         TargetTransformInfo::PeelingPreferences &PP,
-
                         bool &UseUpperBound);
 
 void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
@@ -138,12 +127,6 @@
     Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
     Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount);
 
-TargetTransformInfo::PeelingPreferences
-gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
-                         const TargetTransformInfo &TTI,
-                         Optional<bool> UserAllowPeeling,
-                         Optional<bool> UserAllowProfileBasedPeeling);
-
 unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
                              bool &NotDuplicatable, bool &Convergent,
                              const TargetTransformInfo &TTI,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/User.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -46,6 +46,7 @@
 
 #include "llvm/Transforms/Scalar/LoopFuse.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -53,6 +54,7 @@
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
@@ -64,6 +66,7 @@
 #include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
 
 using namespace llvm;
 
@@ -114,6 +117,14 @@
                           "Use all available analyses")),
     cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
 
+static cl::opt<bool>
+    FusionPeel("loop-fusion-peel", cl::init(false), cl::Hidden,
+               cl::desc("Peel loops if needed so that fusion can take place"));
+
+static cl::opt<unsigned> FusionPeelMaxCount(
+    "loop-fusion-peel-max-count", cl::init(3), cl::Hidden,
+    cl::desc("Max number of iterations to be peeled from a loop"));
+
 #ifndef NDEBUG
 static cl::opt<bool>
     VerboseFusionDebugging("loop-fusion-verbose-debug",
@@ -157,6 +168,12 @@
   bool Valid;
   /// Guard branch of the loop, if it exists
   BranchInst *GuardBranch;
+  /// Peeling Paramaters of the Loop.
+  TTI::PeelingPreferences PP;
+  /// Can you Peel this Loop?
+  bool AbleToPeel;
+  /// Has this loop been Peeled
+  bool Peeled;
 
   /// Dominator and PostDominator trees are needed for the
   /// FusionCandidateCompare function, required by FusionCandidateSet to
@@ -168,11 +185,13 @@
   OptimizationRemarkEmitter &ORE;
 
   FusionCandidate(Loop *L, const DominatorTree *DT,
-                  const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
+                  const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
+                  TTI::PeelingPreferences PP)
       : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
         ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
         Latch(L->getLoopLatch()), L(L), Valid(true),
-        GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) {
+        GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
+        Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
 
     // Walk over all blocks in the loop and check for conditions that may
     // prevent fusion. For each block, walk over all instructions and collect
@@ -243,6 +262,17 @@
       return Preheader;
   }
 
+  /// After Peeling the loop is modified quite a bit, hence all of the Blocks
+  /// need to be updated accordingly.
+  void updateAfterPeeling() {
+    Preheader = L->getLoopPreheader();
+    Header = L->getHeader();
+    ExitingBlock = L->getExitingBlock();
+    ExitBlock = L->getExitBlock();
+    Latch = L->getLoopLatch();
+    verify();
+  }
+
   /// Given a guarded loop, get the successor of the guard that is not in the
   /// loop.
   ///
@@ -254,6 +284,8 @@
     assert(GuardBranch && "Only valid on guarded loops.");
     assert(GuardBranch->isConditional() &&
            "Expecting guard to be a conditional branch.");
+    if (Peeled)
+      return GuardBranch->getSuccessor(1);
     return (GuardBranch->getSuccessor(0) == Preheader)
                ? GuardBranch->getSuccessor(1)
                : GuardBranch->getSuccessor(0);
@@ -358,25 +390,26 @@
   bool operator()(const FusionCandidate &LHS,
                   const FusionCandidate &RHS) const {
     const DominatorTree *DT = LHS.DT;
+    const PostDominatorTree *PDT = LHS.PDT;
 
     BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
     BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
 
     // Do not save PDT to local variable as it is only used in asserts and thus
     // will trigger an unused variable warning if building without asserts.
-    assert(DT && LHS.PDT && "Expecting valid dominator tree");
+    assert(DT && PDT && "Expecting valid dominator tree");
 
     // Do this compare first so if LHS == RHS, function returns false.
     if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
       // RHS dominates LHS
       // Verify LHS post-dominates RHS
-      assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
+      assert(PDT->dominates(LHSEntryBlock, RHSEntryBlock));
       return false;
     }
 
     if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
       // Verify RHS Postdominates LHS
-      assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
+      assert(PDT->dominates(RHSEntryBlock, LHSEntryBlock));
       return true;
     }
 
@@ -515,13 +548,17 @@
   ScalarEvolution &SE;
   PostDominatorTree &PDT;
   OptimizationRemarkEmitter &ORE;
+  AssumptionCache &AC;
+
+  const TargetTransformInfo &TTI;
 
 public:
   LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
             ScalarEvolution &SE, PostDominatorTree &PDT,
-            OptimizationRemarkEmitter &ORE, const DataLayout &DL)
+            OptimizationRemarkEmitter &ORE, const DataLayout &DL,
+            AssumptionCache &AC, const TargetTransformInfo &TTI)
       : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
-        DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {}
+        DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
 
   /// This is the main entry point for loop fusion. It will traverse the
   /// specified function and collect candidate loops to fuse, starting at the
@@ -606,7 +643,9 @@
   /// Flow Equivalent sets, sorted by dominance.
   void collectFusionCandidates(const LoopVector &LV) {
     for (Loop *L : LV) {
-      FusionCandidate CurrCand(L, &DT, &PDT, ORE);
+      TTI::PeelingPreferences PP =
+          gatherPeelingPreferences(L, SE, TTI, None, None);
+      FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP);
       if (!CurrCand.isEligibleForFusion(SE))
         continue;
 
@@ -656,33 +695,126 @@
   /// Determine if two fusion candidates have the same trip count (i.e., they
   /// execute the same number of iterations).
   ///
-  /// Note that for now this method simply returns a boolean value because there
-  /// are no mechanisms in loop fusion to handle different trip counts. In the
-  /// future, this behaviour can be extended to adjust one of the loops to make
-  /// the trip counts equal (e.g., loop peeling). When this is added, this
-  /// interface may need to change to return more information than just a
-  /// boolean value.
-  bool identicalTripCounts(const FusionCandidate &FC0,
-                           const FusionCandidate &FC1) const {
+  /// This function will return a pair of values. The first is a boolean,
+  /// stating whether or not the two candidates are known at compile time to
+  /// have the same TripCount. The second is the difference in the two
+  /// TripCounts. This information can be used later to determine whether or not
+  /// peeling can be performed on either one of the candiates.
+  std::pair<bool, Optional<unsigned>>
+  HaveIdenticalTripCounts(const FusionCandidate &FC0,
+                          const FusionCandidate &FC1) const {
+    std::pair<bool, Optional<unsigned>> Res = std::make_pair(false, None);
+
     const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
     if (isa<SCEVCouldNotCompute>(TripCount0)) {
       UncomputableTripCount++;
       LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
-      return false;
+      return Res;
     }
 
     const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
     if (isa<SCEVCouldNotCompute>(TripCount1)) {
       UncomputableTripCount++;
       LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
-      return false;
+      return Res;
     }
+
     LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
                       << *TripCount1 << " are "
                       << (TripCount0 == TripCount1 ? "identical" : "different")
                       << "\n");
 
-    return (TripCount0 == TripCount1);
+    Res.first = (TripCount0 == TripCount1);
+
+    if (Res.first)
+      return Res;
+
+    LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
+                         "determining the difference between trip counts\n");
+
+    // Currently only considering loops with a single exit point
+    // and a non-constant trip count.
+    unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
+    unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
+
+    // If any of the tripcounts are zero that means that loop(s) do not have
+    // a single exit or a constant tripcount.
+    if (TC0 == 0 || TC1 == 0) {
+      LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
+                           "have a constant number of iterations. Peeling "
+                           "is not benefical\n");
+      return Res;
+    }
+
+    int diff = TC0 - TC1;
+
+    if (diff > 0)
+      Res.second = diff;
+    else {
+      LLVM_DEBUG(
+          dbgs()
+          << "Difference is less than 0. FC1 (second loop) has more "
+             "iterations than the first one. Currently not supported.\n");
+    }
+
+    LLVM_DEBUG(dbgs() << "Difference in Loop trip count is: " << Res.second
+                      << "\n");
+
+    return Res;
+  }
+
+  void PeelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
+                           unsigned PeelCount) {
+    assert(FC0.AbleToPeel && "Should be able to Peel Loop");
+
+    LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
+                      << " iterations of the first loop. \n");
+
+    FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true);
+    if (FC0.Peeled) {
+      LLVM_DEBUG(dbgs() << "Done Peeling\n");
+
+      auto IdenticalTripCount = HaveIdenticalTripCounts(FC0, FC1);
+
+      assert(IdenticalTripCount.first && IdenticalTripCount.second == None &&
+             "Loops should have identical trip counts after peeling");
+
+      FC0.PP.PeelCount = PeelCount;
+
+      // Peeling does not update the PDT
+      PDT.recalculate(*FC0.Preheader->getParent());
+
+      FC0.updateAfterPeeling();
+
+      // In this case the iterations of the loop are constant, so the first
+      // loop will exectue completely (will not jump from one of
+      // the peeled blocks to the second loop). Here we are updating the
+      // branch conditions of each of the peeled blocks, such that it will
+      // branch to its successor which is not the Preheader of the second Loop.
+      // Doing this update will ensure that the entry block of the first loop
+      // dominates the entry block of the second loop.
+      BasicBlock *BB =
+          FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
+      SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+      for (BasicBlock *Pred : predecessors(BB)) {
+        if (Pred != FC0.ExitBlock) {
+          BranchInst *Old = dyn_cast<BranchInst>(Pred->getTerminator());
+          BasicBlock *Succ = Old->getSuccessor(0);
+          if (Succ == BB)
+            Succ = Old->getSuccessor(1);
+          BranchInst *NewBranch = BranchInst::Create(Succ);
+          ReplaceInstWithInst(Old, NewBranch);
+          TreeUpdates.emplace_back(
+              DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
+        }
+      }
+      DTU.applyUpdates(TreeUpdates);
+      DTU.flush();
+      LLVM_DEBUG(
+          dbgs() << "Sucessfully Peeled " << FC0.PP.PeelCount
+                 << " iterations from the first loop.\n"
+                    "Both Loops have the same number of iterations now.\n");
+    }
   }
 
   /// Walk each set of control flow equivalent fusion candidates and attempt to
@@ -716,7 +848,28 @@
           FC0->verify();
           FC1->verify();
 
-          if (!identicalTripCounts(*FC0, *FC1)) {
+          // Check if the candidates have identical tripcounts (first value of
+          // pair), and if not check the difference in the tripcounts between
+          // the loops (second value of pair). The difference is not equal to
+          // None iff the loops iterate a constant number of times, and have a
+          // single exit.
+          std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
+              HaveIdenticalTripCounts(*FC0, *FC1);
+
+          // Check if the user specfices to use Peeling with fusion
+          // Check FC0 (first loop) is able to be peeled
+          // Check that both loops have the different tripcounts
+          if (FusionPeel.getNumOccurrences() > 0 && FusionPeel &&
+              FC0->AbleToPeel && !IdenticalTripCountRes.first &&
+              IdenticalTripCountRes.second) {
+            assert(*IdenticalTripCountRes.second <= FusionPeelMaxCount &&
+                   "Peel Count: is greater than maximum value specificed");
+            // Dependent on peeling being performed on the first loop, and
+            // assuming all other conditions for fusion returning true.
+            IdenticalTripCountRes.first = true;
+          }
+
+          if (!IdenticalTripCountRes.first) {
             LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
                                  "counts. Not fusing.\n");
             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@@ -734,7 +887,8 @@
           // Ensure that FC0 and FC1 have identical guards.
           // If one (or both) are not guarded, this check is not necessary.
           if (FC0->GuardBranch && FC1->GuardBranch &&
-              !haveIdenticalGuards(*FC0, *FC1)) {
+              !haveIdenticalGuards(*FC0, *FC1) &&
+              !IdenticalTripCountRes.second) {
             LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
                                  "guards. Not Fusing.\n");
             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@@ -796,20 +950,31 @@
                                                        FusionNotBeneficial);
             continue;
           }
-          // All analysis has completed and has determined that fusion is legal
-          // and profitable. At this point, start transforming the code and
-          // perform fusion.
+          // All analysis has completed and has determined that fusion is
+          // legal and profitable. At this point, start transforming the
+          // code and perform fusion.
 
           LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
                             << *FC1 << "\n");
 
+          FusionCandidate FC0Copy = *FC0;
+          // Peel the loop after determining that fusion is legal. The Loops
+          // will still be safe to fuse after the peeling is performed.
+          if (IdenticalTripCountRes.second)
+            PeelFusionCandidate(FC0Copy, *FC1, *IdenticalTripCountRes.second);
+
           // Report fusion to the Optimization Remarks.
           // Note this needs to be done *before* performFusion because
           // performFusion will change the original loops, making it not
           // possible to identify them after fusion is complete.
-          reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
-
-          FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
+          reportLoopFusion<OptimizationRemark>(
+              (IdenticalTripCountRes.second ? FC0Copy : *FC0), *FC1,
+              FuseCounter);
+
+          FusionCandidate FusedCand(
+              performFusion((IdenticalTripCountRes.second ? FC0Copy : *FC0),
+                            *FC1),
+              &DT, &PDT, ORE, FC0Copy.PP);
           FusedCand.verify();
           assert(FusedCand.isEligibleForFusion(SE) &&
                  "Fused candidate should be eligible for fusion!");
@@ -1094,8 +1259,9 @@
       assert(FCLatchBranch->isConditional() &&
              FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
              "Expecting the two successors of FCLatchBranch to be the same");
-      FCLatchBranch->setCondition(
-          llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType()));
+      BranchInst *NewBranch =
+          BranchInst::Create(FCLatchBranch->getSuccessor(0));
+      ReplaceInstWithInst(FCLatchBranch, NewBranch);
     }
   }
 
@@ -1155,7 +1321,8 @@
     if (FC0.GuardBranch)
       return fuseGuardedLoops(FC0, FC1);
 
-    assert(FC1.Preheader == FC0.ExitBlock);
+    assert(FC1.Preheader ==
+           (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
     assert(FC1.Preheader->size() == 1 &&
            FC1.Preheader->getSingleSuccessor() == FC1.Header);
 
@@ -1178,7 +1345,7 @@
     FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
 
     // Then modify the control flow and update DT and PDT.
-    SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+    SmallVector<DominatorTree::UpdateType, 16> TreeUpdates;
 
     // The old exiting block of the first loop (FC0) has to jump to the header
     // of the second as we need to execute the code in the second header block
@@ -1197,12 +1364,27 @@
     // to FC1.Header? I think this is basically what the three sequences are
     // trying to accomplish; however, doing this directly in the CFG may mean
     // the DT/PDT becomes invalid
-    FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
-                                                         FC1.Header);
-    TreeUpdates.emplace_back(DominatorTree::UpdateType(
-        DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
-    TreeUpdates.emplace_back(DominatorTree::UpdateType(
-        DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+    if (!FC0.Peeled) {
+      FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
+                                                           FC1.Header);
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+    } else {
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
+
+      // Remove the ExitBlock of the first Loop (also not needed)
+      FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+                                                           FC1.Header);
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+      FC0.ExitBlock->getTerminator()->eraseFromParent();
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+      new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+    }
 
     // The pre-header of L1 is not necessary anymore.
     assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
@@ -1246,7 +1428,7 @@
     FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
     FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
 
-    // Change the condition of FC0 latch branch to true, as both successors of
+    // Modify the latch branch of FC0 to be unconditional as both successors of
     // the branch are the same.
     simplifyLatchBranch(FC0);
 
@@ -1268,6 +1450,11 @@
 
     LI.removeBlock(FC1.Preheader);
     DTU.deleteBB(FC1.Preheader);
+    if (FC0.Peeled) {
+      LI.removeBlock(FC0.ExitBlock);
+      DTU.deleteBB(FC0.ExitBlock);
+    }
+
     DTU.flush();
 
     // Is there a way to keep SE up-to-date so we don't need to forget the loops
@@ -1364,10 +1551,18 @@
     BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
     BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
     BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+    BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
 
     // Move instructions from the exit block of FC0 to the beginning of the exit
-    // block of FC1.
-    moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI);
+    // block of FC1, in the case that the FC0 loop has not been peeled. In the
+    // case that FC0 loop is peeled, then move the instructions of the successor
+    // of the FC0 Exit block to the to the beginning of the exit block of FC1.
+    if (!FC0.Peeled)
+      moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT,
+                                     DI);
+    else
+      moveInstructionsToTheBeginning(*FC0ExitBlockSuccessor, *FC1.ExitBlock, DT,
+                                     PDT, DI);
 
     // Move instructions from the guard block of FC1 to the end of the guard
     // block of FC0.
@@ -1387,8 +1582,12 @@
     // for FC1 (where FC1 guard would have gone if FC1 was not executed).
     FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
     FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
-    FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
-                                                      FC1.Header);
+    if (!FC0.Peeled)
+      FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
+                                                        FC1.Header);
+    else
+      FC0ExitBlockSuccessor->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
+                                                                FC1.Header);
 
     // The guard of FC1 is not necessary anymore.
     FC1.GuardBranch->eraseFromParent();
@@ -1403,6 +1602,15 @@
     TreeUpdates.emplace_back(DominatorTree::UpdateType(
         DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
 
+    if (FC0.Peeled) {
+      // Remove the Block after the ExitBlock of FC0
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
+      FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
+      new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
+                          FC0ExitBlockSuccessor);
+    }
+
     assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
            "Expecting guard block to have no predecessors");
     assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
@@ -1540,6 +1748,10 @@
     LI.removeBlock(FC1GuardBlock);
     LI.removeBlock(FC1.Preheader);
     LI.removeBlock(FC0.ExitBlock);
+    if (FC0.Peeled) {
+      LI.removeBlock(FC0ExitBlockSuccessor);
+      DTU.deleteBB(FC0ExitBlockSuccessor);
+    }
     DTU.deleteBB(FC1GuardBlock);
     DTU.deleteBB(FC1.Preheader);
     DTU.deleteBB(FC0.ExitBlock);
@@ -1606,6 +1818,8 @@
     AU.addRequired<PostDominatorTreeWrapperPass>();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addRequired<DependenceAnalysisWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
 
     AU.addPreserved<ScalarEvolutionWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
@@ -1622,9 +1836,12 @@
     auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
     auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
     auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     const DataLayout &DL = F.getParent()->getDataLayout();
-    LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
+
+    LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
     return LF.fuseLoops(F);
   }
 };
@@ -1637,9 +1854,11 @@
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   const DataLayout &DL = F.getParent()->getDataLayout();
-  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
+
+  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
   bool Changed = LF.fuseLoops(F);
   if (!Changed)
     return PreservedAnalyses::all();
@@ -1662,6 +1881,8 @@
 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
 
 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -56,6 +56,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
@@ -115,10 +116,6 @@
     cl::desc(
         "Set the max unroll count for full unrolling, for testing purposes"));
 
-static cl::opt<unsigned> UnrollPeelCount(
-    "unroll-peel-count", cl::Hidden,
-    cl::desc("Set the unroll peeling count, for testing purposes"));
-
 static cl::opt<bool>
     UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
                        cl::desc("Allows loops to be partially unrolled until "
@@ -149,15 +146,6 @@
              "threshold, the loop is considered as flat and will be less "
              "aggressively unrolled."));
 
-static cl::opt<bool>
-    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
-                       cl::desc("Allows loops to be peeled when the dynamic "
-                                "trip count is known to be low."));
-
-static cl::opt<bool> UnrollAllowLoopNestsPeeling(
-    "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden,
-    cl::desc("Allows loop nests to be peeled."));
-
 static cl::opt<bool> UnrollUnrollRemainder(
   "unroll-remainder", cl::Hidden,
   cl::desc("Allow the loop remainder to be unrolled."));
@@ -175,6 +163,7 @@
     "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
     cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
              "optimizations"));
+
 static cl::opt<unsigned>
     UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
                            cl::Hidden,
@@ -275,33 +264,6 @@
   return UP;
 }
 
-TargetTransformInfo::PeelingPreferences
-llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
-                               const TargetTransformInfo &TTI,
-                               Optional<bool> UserAllowPeeling,
-                               Optional<bool> UserAllowProfileBasedPeeling) {
-  TargetTransformInfo::PeelingPreferences PP;
-
-  // Get Target Specifc Values
-  TTI.getPeelingPreferences(L, SE, PP);
-
-  // User Specified Values using cl::opt
-  if (UnrollPeelCount.getNumOccurrences() > 0)
-    PP.PeelCount = UnrollPeelCount;
-  if (UnrollAllowPeeling.getNumOccurrences() > 0)
-    PP.AllowPeeling = UnrollAllowPeeling;
-  if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
-    PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
-
-  // User Specifed values provided by argument
-  if (UserAllowPeeling.hasValue())
-    PP.AllowPeeling = *UserAllowPeeling;
-  if (UserAllowProfileBasedPeeling.hasValue())
-    PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
-
-  return PP;
-}
-
 namespace {
 
 /// A struct to densely store the state of an instruction after unrolling at
@@ -875,7 +837,7 @@
   }
 
   // 4th priority is loop peeling.
-  computePeelCount(L, LoopSize, UP, PP, TripCount, SE);
+  computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold);
   if (PP.PeelCount) {
     UP.Runtime = false;
     UP.Count = 1;
@@ -1081,7 +1043,7 @@
       ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
       ProvidedFullUnrollMaxCount);
   TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
-      L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling);
+      L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
 
   // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
   // as threshold later on.
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -35,11 +35,11 @@
   LCSSA.cpp
   LibCallsShrinkWrap.cpp
   Local.cpp
+  LoopPeel.cpp
   LoopRotationUtils.cpp
   LoopSimplify.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
-  LoopUnrollPeel.cpp
   LoopUnrollRuntime.cpp
   LoopUtils.cpp
   LoopVersioning.cpp
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
rename from llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
rename to llvm/lib/Transforms/Utils/LoopPeel.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -1,4 +1,4 @@
-//===- UnrollLoopPeel.cpp - Loop peeling utilities ------------------------===//
+//===- LoopPeel.cpp -------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,53 +6,65 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements some loop unrolling utilities for peeling loops
-// with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for
-// unrolling loops with compile-time constant trip counts.
-//
+// Loop Peel Utilities.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Transforms/Utils/LoopPeel.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <limits>
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-#define DEBUG_TYPE "loop-unroll"
+#define DEBUG_TYPE "loop-peel"
 
 STATISTIC(NumPeeled, "Number of loops peeled");
 
+static cl::opt<unsigned> UnrollPeelCount(
+    "unroll-peel-count", cl::Hidden,
+    cl::desc("Set the unroll peeling count, for testing purposes"));
+
+static cl::opt<bool>
+    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
+                       cl::desc("Allows loops to be peeled when the dynamic "
+                                "trip count is known to be low."));
+
+static cl::opt<bool>
+    UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",
+                                cl::init(false), cl::Hidden,
+                                cl::desc("Allows loop nests to be peeled."));
+
 static cl::opt<unsigned> UnrollPeelMaxCount(
     "unroll-peel-max-count", cl::init(7), cl::Hidden,
     cl::desc("Max average trip count which will cause loop peeling."));
@@ -278,9 +290,9 @@
 
 // Return the number of iterations we want to peel off.
 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
-                            TargetTransformInfo::UnrollingPreferences &UP,
                             TargetTransformInfo::PeelingPreferences &PP,
-                            unsigned &TripCount, ScalarEvolution &SE) {
+                            unsigned &TripCount, ScalarEvolution &SE,
+                            unsigned Threshold) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
   // Save the PP.PeelCount value set by the target in
   // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -322,7 +334,7 @@
   // maximum number of iterations among these values, thus turning all those
   // Phis into invariants.
   // First, check that we can peel at least one iteration.
-  if (2 * LoopSize <= UP.Threshold && UnrollPeelMaxCount > 0) {
+  if (2 * LoopSize <= Threshold && UnrollPeelMaxCount > 0) {
     // Store the pre-calculated values here.
     SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
     // Now go through all Phis to calculate their the number of iterations they
@@ -342,7 +354,7 @@
 
     // Pay respect to limitations implied by loop size and the max peel count.
     unsigned MaxPeelCount = UnrollPeelMaxCount;
-    MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
+    MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1);
 
     DesiredPeelCount = std::max(DesiredPeelCount,
                                 countToEliminateCompares(*L, MaxPeelCount, SE));
@@ -385,7 +397,7 @@
 
     if (*PeelCount) {
       if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) &&
-          (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
+          (LoopSize * (*PeelCount + 1) <= Threshold)) {
         LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
                           << " iterations.\n");
         PP.PeelCount = *PeelCount;
@@ -396,7 +408,7 @@
       LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
       LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
                         << "\n");
-      LLVM_DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
+      LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
     }
   }
 }
@@ -491,7 +503,7 @@
 /// instructions in the last peeled-off iteration.
 static void cloneLoopBlocks(
     Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot,
-    SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *> > &ExitEdges,
+    SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
     SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
     ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
     LoopInfo *LI) {
@@ -599,6 +611,34 @@
     LVMap[KV.first] = KV.second;
 }
 
+TargetTransformInfo::PeelingPreferences llvm::gatherPeelingPreferences(
+    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+    Optional<bool> UserAllowPeeling,
+    Optional<bool> UserAllowProfileBasedPeeling, bool UnrollingSpecficValues) {
+  TargetTransformInfo::PeelingPreferences PP;
+
+  // Get Target Specifc Values
+  TTI.getPeelingPreferences(L, SE, PP);
+
+  if (UnrollingSpecficValues) {
+    // User Specified Values using cl::opt
+    if (UnrollPeelCount.getNumOccurrences() > 0)
+      PP.PeelCount = UnrollPeelCount;
+    if (UnrollAllowPeeling.getNumOccurrences() > 0)
+      PP.AllowPeeling = UnrollAllowPeeling;
+    if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
+      PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
+  }
+
+  // User Specifed values provided by argument
+  if (UserAllowPeeling.hasValue())
+    PP.AllowPeeling = *UserAllowPeeling;
+  if (UserAllowProfileBasedPeeling.hasValue())
+    PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
+
+  return PP;
+}
+
 /// Peel off the first \p PeelCount iterations of loop \p L.
 ///
 /// Note that this does not peel them off as a single straight-line block.
@@ -609,8 +649,8 @@
 /// for the bulk of dynamic execution, can be further simplified by scalar
 /// optimizations.
 bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
-                    ScalarEvolution *SE, DominatorTree *DT,
-                    AssumptionCache *AC, bool PreserveLCSSA) {
+                    ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+                    bool PreserveLCSSA) {
   assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
   assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -59,6 +59,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
diff --git a/llvm/test/Transforms/LoopFusion/guarded_peel.ll b/llvm/test/Transforms/LoopFusion/guarded_peel.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/guarded_peel.ll
@@ -0,0 +1,84 @@
+; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s
+
+; This will test if we are able to fuse two guarded loops which have constant
+; but different trip counts. The first two iterations of the first loop should
+; be peeled off, and then the loops should be fused together.
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; CHECK: void @main
+; CHECK-next: entry:
+; CHECK: br i1 %cmp4, label %for.first.entry, label %for.end
+; CHECK: for.first.entry
+; CHECK:  br label %for.first.peel.begin
+; CHECK: for.first.peel.begin:
+; CHECK: br label %for.first.peel
+; CHECK: for.first.peel:
+; CHECK: br label %for.first.peel.next
+; CHECK: for.first.peel.next:
+; CHECK: br label %for.first.peel2
+; CHECK: for.first.peel2:
+; CHECK: br label %for.first.peel.next1
+; CHECK: for.first.peel.next1:
+; CHECK: br label %for.first.peel.next11
+; CHECK: for.first.peel.next11:
+; CHECK: br label %for.first.entry.peel.newph
+; CHECK: br label %for.first
+; CHECK: for.first:
+; CHECK: br i1 %cmp3, label %for.first, label %for.second.exit
+; CHECK: for.second.exit:
+; CHECK:  br label %for.end
+; CHECK: for.end:
+; CHECK: ret void
+
+define void @main(i32* noalias %A) {
+entry:
+  %cmp4 = icmp slt i64 0, 45
+  br i1 %cmp4, label %for.first.entry, label %for.second.guard
+
+for.first.entry:                               ; preds = %entry
+  br label %for.first
+
+for.first:                                         ; preds = %for.first.entry, %for.first
+  %i.05 = phi i64 [ %inc, %for.first ], [ 0, %for.first.entry ]
+  %sub = sub nsw i64 %i.05, 3
+  %add = add nsw i64 %i.05, 3
+  %mul = mul nsw i64 %sub, %add
+  %rem = srem i64 %mul, %i.05
+  %conv = trunc i64 %rem to i32
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.05
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp slt i64 %inc, 45
+  br i1 %cmp, label %for.first, label %for.first.exit
+
+for.first.exit:                                 ; preds = %for.first
+  br label %for.second.guard
+
+for.second.guard:                                          ; preds = %for.first.exit, %entry
+  %cmp31 = icmp slt i64 2, 45
+  br i1 %cmp31, label %for.second.entry, label %for.end
+
+for.second.entry:                              ; preds = %for.second.guard
+  br label %for.second
+
+for.second:                                        ; preds = %for.second.entry, %for.second
+  %i1.02 = phi i64 [ %inc14, %for.second ], [ 2, %for.second.entry ]
+  %sub7 = sub nsw i64 %i1.02, 3
+  %add8 = add nsw i64 %i1.02, 3
+  %mul9 = mul nsw i64 %sub7, %add8
+  %rem10 = srem i64 %mul9, %i1.02
+  %conv11 = trunc i64 %rem10 to i32
+  %arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.02
+  store i32 %conv11, i32* %arrayidx12, align 4
+  %inc14 = add nsw i64 %i1.02, 1
+  %cmp3 = icmp slt i64 %inc14, 45
+  br i1 %cmp3, label %for.second, label %for.second.exit
+
+for.second.exit:                               ; preds = %for.second
+  br label %for.end
+
+for.end:                                        ; preds = %for.second.exit, %for.second.guard
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopFusion/guarded_unsafeblock_peel.ll b/llvm/test/Transforms/LoopFusion/guarded_unsafeblock_peel.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/guarded_unsafeblock_peel.ll
@@ -0,0 +1,68 @@
+; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s
+
+; This will test that we do not fuse two guarded loops together.
+; These loops do not have the same trip count, fusing should be possible after
+; peeling the loops. However, the exit block of the first loop makes the loops
+; unsafe to peel.
+; The expected output of this test is the function as below.
+
+; CHECK: void @unsafe_exitblock
+; CHECK: for.first.guard
+; CHECK: br i1 %cmp3, label %for.first.preheader, label %for.second.guard
+; CHECK: for.first:
+; CHECK: br i1 %cmp, label %for.first, label %for.first.exit
+; CHECK: for.first.exit:
+; CHECK: br label %for.second.guard
+; CHECK: for.second.guard:
+; CHECK: br i1 %cmp21, label %for.second.preheader, label %for.end
+; CHECK: for.second.preheader:
+; CHECK: br label %for.second
+; CHECK: for.second:
+; CHECK: br i1 %cmp2, label %for.second, label %for.second.exit
+; CHECK: for.second.exit:
+; CHECK: br label %for.end
+; CHECK: for.end:
+; CHECK: ret void
+define void @unsafe_exitblock(i32* noalias %A, i32* noalias %B) {
+for.first.guard:
+  %cmp3 = icmp slt i64 0, 45
+  br i1 %cmp3, label %for.first.preheader, label %for.second.guard
+
+for.first.preheader:
+  br label %for.first
+
+for.first:
+  %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04
+  store i32 0, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.04, 1
+  %cmp = icmp slt i64 %inc, 45
+  br i1 %cmp, label %for.first, label %for.first.exit
+
+for.first.exit:
+  call void @bar()
+  br label %for.second.guard
+
+for.second.guard:
+  %cmp21 = icmp slt i64 2,45
+  br i1 %cmp21, label %for.second.preheader, label %for.end
+
+for.second.preheader:
+  br label %for.second
+
+for.second:
+  %j.02 = phi i64 [ %inc6, %for.second ], [ 2, %for.second.preheader ]
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02
+  store i32 0, i32* %arrayidx4, align 4
+  %inc6 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc6, 45
+  br i1 %cmp2, label %for.second, label %for.second.exit
+
+for.second.exit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+declare void @bar()
diff --git a/llvm/test/Transforms/LoopFusion/nonadjacent_peel.ll b/llvm/test/Transforms/LoopFusion/nonadjacent_peel.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/nonadjacent_peel.ll
@@ -0,0 +1,82 @@
+; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s
+
+; This will check that we do not fuse these two loops together. These loops are
+; valid cadidates for peeling, however they are not adjacent.
+; The expected output of this test is the function below.
+
+; CHECK: void @function
+; CHECK-NEXT: for.first.preheader:
+; CHECK-NEXT: br label %for.first
+; CHECK: for.first:
+; CHECK: br label %for.first.latch
+; CHECK: for.first.latch:
+; CHECK: br i1 %exitcond4, label %for.first, label %for.first.exit
+; CHECK: for.first.exit:
+; CHECK: br label %for.next
+; CHECK: for.next:
+; CHECK: br label %for.second.preheader
+; CHECK: for.second.preheader:
+; CHECK: br label %for.second
+; CHECK: for.second:
+; CHECK: br label %for.second.latch
+; CHECK: br i1 %exitcond, label %for.second, label %for.end
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+define void @function(i32* noalias %arg) {
+for.first.preheader:
+  br label %for.first
+
+for.first:                                              ; preds = %for.first.preheader, %for.first.latch
+  %.014 = phi i32 [ 0, %for.first.preheader ], [ %tmp15, %for.first.latch ]
+  %indvars.iv23 = phi i64 [ 0, %for.first.preheader ], [ %indvars.iv.next3, %for.first.latch ]
+  %tmp = add nsw i32 %.014, -3
+  %tmp8 = add nuw nsw i64 %indvars.iv23, 3
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = mul nsw i32 %tmp, %tmp9
+  %tmp11 = trunc i64 %indvars.iv23 to i32
+  %tmp12 = srem i32 %tmp10, %tmp11
+  %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv23
+  store i32 %tmp12, i32* %tmp13, align 4
+  br label %for.first.latch
+
+for.first.latch:                                             ; preds = %for.first
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv23, 1
+  %tmp15 = add nuw nsw i32 %.014, 1
+  %exitcond4 = icmp ne i64 %indvars.iv.next3, 100
+  br i1 %exitcond4, label %for.first, label %for.first.exit
+
+for.first.exit:                                            ; preds: %for.first.latch
+  br label %for.next
+
+for.next:                                      ; preds = %for.first.exit
+  br label %for.second.preheader
+
+for.second.preheader:                                   ; preds = %for.next
+  br label %for.second
+
+for.second:                                             ; preds = %for.second.preheader, %for.second.latch
+  %.02 = phi i32 [ 0, %for.second.preheader ], [ %tmp28, %for.second.latch ]
+  %indvars.iv1 = phi i64 [ 3, %for.second.preheader ], [ %indvars.iv.next, %for.second.latch ]
+  %tmp20 = add nsw i32 %.02, -3
+  %tmp21 = add nuw nsw i64 %indvars.iv1, 3
+  %tmp22 = trunc i64 %tmp21 to i32
+  %tmp23 = mul nsw i32 %tmp20, %tmp22
+  %tmp24 = trunc i64 %indvars.iv1 to i32
+  %tmp25 = srem i32 %tmp23, %tmp24
+  %tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv1
+  store i32 %tmp25, i32* %tmp26, align 4
+  br label %for.second.latch
+
+for.second.latch:                                             ; preds = %for.second
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+  %tmp28 = add nuw nsw i32 %.02, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.second, label %for.end
+
+for.end:                                             ; preds = %for.second.latch
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopFusion/peel.ll b/llvm/test/Transforms/LoopFusion/peel.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/peel.ll
@@ -0,0 +1,106 @@
+; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s
+
+; This will test whether we can fuse two loops together if they have constant
+; but a different tripcount.
+; The first three iterations of the first loop should be peeled, and then the
+; two loops should be fused together in this example.
+
+; C Code
+;
+; int B[1024];
+;
+; void function(int *arg) {
+;   for (int i = 0; i != 100; ++i)
+;     arg[i] = ((i - 3)*(i+3)) % i;
+;
+;   for (int i = 3; i != 100; ++i)
+;     B[i] = ((i-6)*(i+3)) % i;
+; }
+
+; CHECK: void @function
+; CHECK-NEXT: for.first.preheader:
+; CHECK-NEXT: br label %for.first.peel.begin
+; CHECK: for.first.peel.begin:
+; CHECK-NEXT: br label %for.first.peel
+; CHECK: for.first.peel
+; CHECK: br label %for.first.latch.peel
+; CHECK: for.first.latch.peel:
+; CHECK: br label %for.first.peel.next
+; CHECK: for.first.peel.next:
+; CHECK: br label %for.first.peel2
+; CHECK: for.first.peel2:
+; CHECK: br label %for.first.latch.peel10
+; CHECK: for.first.latch.peel10:
+; CHECK: br label %for.first.peel.next1
+; CHECK: for.first.peel.next1:
+; CHECK: br label %for.first.peel15
+; CHECK: for.first.peel15:
+; CHECK: br label %for.first.latch.peel23
+; CHECK: for.first.latch.peel23:
+; CHECK: br label %for.first.peel.next14
+; CHECK: for.first.peel.next14:
+; CHECK-NEXT: br label %for.first.peel.next27
+; CHECK: for.first.peel.next27:
+; CHECK-NEXT: br label %for.first.preheader.peel.newph
+; CHECK: for.first.preheader.peel.newph:
+; CHECK-NEXT: br label %for.first
+; CHECK: for.first:
+; CHECK: br label %for.first.latch
+; CHECK: for.first.latch:
+; CHECK: br label %for.second.latch
+; CHECK: for.second.latch:
+; CHECK: br i1 %exitcond, label %for.first, label %for.end
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+define void @function(i32* noalias %arg) {
+for.first.preheader:
+  br label %for.first
+
+for.first:                                              ; preds = %for.first.preheader, %for.first.latch
+  %.014 = phi i32 [ 0, %for.first.preheader ], [ %tmp15, %for.first.latch ]
+  %indvars.iv23 = phi i64 [ 0, %for.first.preheader ], [ %indvars.iv.next3, %for.first.latch ]
+  %tmp = add nsw i32 %.014, -3
+  %tmp8 = add nuw nsw i64 %indvars.iv23, 3
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = mul nsw i32 %tmp, %tmp9
+  %tmp11 = trunc i64 %indvars.iv23 to i32
+  %tmp12 = srem i32 %tmp10, %tmp11
+  %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv23
+  store i32 %tmp12, i32* %tmp13, align 4
+  br label %for.first.latch
+
+for.first.latch:                                             ; preds = %for.first
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv23, 1
+  %tmp15 = add nuw nsw i32 %.014, 1
+  %exitcond4 = icmp ne i64 %indvars.iv.next3, 100
+  br i1 %exitcond4, label %for.first, label %for.second.preheader
+
+for.second.preheader:                                   ; preds = %for.first.latch
+  br label %for.second
+
+for.second:                                             ; preds = %for.second.preheader, %for.second.latch
+  %.02 = phi i32 [ 0, %for.second.preheader ], [ %tmp28, %for.second.latch ]
+  %indvars.iv1 = phi i64 [ 3, %for.second.preheader ], [ %indvars.iv.next, %for.second.latch ]
+  %tmp20 = add nsw i32 %.02, -3
+  %tmp21 = add nuw nsw i64 %indvars.iv1, 3
+  %tmp22 = trunc i64 %tmp21 to i32
+  %tmp23 = mul nsw i32 %tmp20, %tmp22
+  %tmp24 = trunc i64 %indvars.iv1 to i32
+  %tmp25 = srem i32 %tmp23, %tmp24
+  %tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv1
+  store i32 %tmp25, i32* %tmp26, align 4
+  br label %for.second.latch
+
+for.second.latch:                                             ; preds = %for.second
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+  %tmp28 = add nuw nsw i32 %.02, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.second, label %for.end
+
+for.end:                                             ; preds = %for.second.latch
+  ret void
+}
+