Index: lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- lib/Transforms/Scalar/LoopRotation.cpp
+++ lib/Transforms/Scalar/LoopRotation.cpp
@@ -28,6 +28,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -39,6 +40,7 @@
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm> // std::max
 using namespace llvm;
 
 #define DEBUG_TYPE "loop-rotate"
@@ -47,6 +49,16 @@
     "rotation-max-header-size", cl::init(16), cl::Hidden,
     cl::desc("The default maximum header size for automatic loop rotation"));
 
+static cl::opt<unsigned> LargeAverageTripCountThreshold(
+    "large-average-tripcount-multiplier-threshold", cl::init(5), cl::Hidden,
+    cl::desc("Loop is considered to have a large average trip count if "
+             "total_tripcount/total_exit > threshold"));
+
+static cl::opt<unsigned> SmallAverageTripCountThreshold(
+    "small-average-tripcount-multiplier-threshold", cl::init(5), cl::Hidden,
+    cl::desc("Loop is considered to have a small average trip count if "
+             "total_exit/total_tripcount > threshold"));
+
 STATISTIC(NumRotated, "Number of loops rotated");
 
 namespace {
@@ -70,6 +82,18 @@
 private:
   bool rotateLoop(Loop *L, bool SimplifiedLatch);
   bool simplifyLoopLatch(Loop *L);
+
+  /// Update the branch_weight metadata for the rotated loop latch and guard
+  /// block.
+  ///
+  /// NOTE: this function assumes that the trip counts of the individual
+  /// executions of the loop are similar or bell-curve-like. PGO does not
+  /// provide a distribution of the loop's trip counts; it merely gives a
+  /// summation of all the times the branch is taken or not taken when the
+  /// loop is hit. If the loop has a more skewed trip count distribution, we
+  /// could end up underestimating the trip count of each execution.
+  void updateLoopEstimatedBranchWeight(Loop *L, BranchInst *GuardBR,
+                                       BranchInst *LatchBR);
 };
 } // end anonymous namespace
 
@@ -363,6 +387,7 @@
   // loop in cases important for nested loops, and it also means we don't have
   // to split as many edges.
   BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+  BranchInst *GBI = PHBI;
   assert(PHBI->isConditional() && "Should be clone of BI condbr!");
   if (!isa<ConstantInt>(PHBI->getCondition()) ||
       PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
@@ -425,6 +450,7 @@
     BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
     NewBI->setDebugLoc(PHBI->getDebugLoc());
     PHBI->eraseFromParent();
+    GBI = NewBI;
 
     // With our CFG finalized, update DomTree if it is available.
     if (DT) {
@@ -463,6 +489,7 @@
     }
   }
 
+  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
   assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
 
@@ -472,12 +499,156 @@
   // emitted code isn't too gross in this common case.
   MergeBlockIntoPredecessor(OrigHeader, DT, LI);
 
+  // We have now done all the transformations on the CFG. Adjust the loop
+  // branch metadata in case we have it.
+  assert(L->getLoopLatch()->getTerminator() == BI &&
+         "Unexpected latch terminator");
+  updateLoopEstimatedBranchWeight(L, GBI, BI);
+
   DEBUG(dbgs() << "LoopRotation: into "; L->dump());
 
   ++NumRotated;
   return true;
 }
 
+/// We have now rotated the loop; update the branch weight prof metadata in
+/// case we have it.
+void LoopRotate::updateLoopEstimatedBranchWeight(Loop *L, BranchInst *GuardBR,
+                                                 BranchInst *LatchBR) {
+  // If the loop has multiple exiting blocks, we would need to look through
+  // each of them and reason about their profile metadata to compute the
+  // rotated loop metadata correctly. E.g. if the loop has an early exit, we
+  // cannot tell whether the latch block will ever be reached after rotation,
+  // i.e. the early exit could have been taken. We bail for now.
+  //
+  // FIXME: Handle loops with early exits.
+  if (!L->getExitingBlock())
+    return;
+
+  // Check whether there is loop !prof metadata.
+  uint64_t LoopBodyWeight, LoopExitWeight;
+  if (!LatchBR->extractProfMetadata(LoopBodyWeight, LoopExitWeight))
+    return;
+
+  // At this point, we know we have branch weight metadata; recompute it.
+  //
+  // Update the latch branch metadata. After the loop is rotated, the number of
+  // times the loop body is executed remains the same, and so does the number
+  // of times the exit block is executed. We use this information to compute
+  // the weights for the rest of the branches.
+  //
+  // Make sure the loop body and exit weights get their correct values.
+  bool LoopBodyOnTrue = LatchBR->getSuccessor(0) == L->getHeader();
+  if (!LoopBodyOnTrue)
+    std::swap(LoopBodyWeight, LoopExitWeight);
+
+  uint64_t LoopPHWeight, BackedgeWeight, NotTakenOnceWeight;
+  // This is the minimum edge weight.
+  uint64_t MinimumEdgeWeight = 1;
+  if (LoopBodyWeight >= LoopExitWeight) {
+    bool HasLargeAverageTripCount =
+        LoopBodyWeight >= (LoopExitWeight * LargeAverageTripCountThreshold);
+    // When LoopBodyWeight >> LoopExitWeight, i.e. the runtime trip count is
+    // large, we model the guard branch as heavily biased against exiting and
+    // give it a 5% chance to exit. This is because every time the guard
+    // branch is hit, it will most likely branch to the loop preheader.
+    //
+    // When the loop has a smaller average trip count (but still greater than
+    // or equal to 1), we give it a 50% chance to exit.
+    double NotTakenOnceRatio = HasLargeAverageTripCount ? 0.05 : 0.5;
+
+    // If the guard branch has been turned into an unconditional branch, we
+    // adjust NotTakenOnceRatio to 0.
+    if (GuardBR->isUnconditional())
+      NotTakenOnceRatio = 0;
+
+    // Calculate the not-taken-once weight and the backedge weight.
+    NotTakenOnceWeight = LoopExitWeight * NotTakenOnceRatio;
+    // The number of times the loop is entered is the same as the number of
+    // times the latch branch goes to the exit block.
+    LoopPHWeight = LoopExitWeight - NotTakenOnceWeight;
+    // The backedge weight is the number of times the loop body is executed
+    // minus the number of times the loop body is entered.
+    BackedgeWeight = LoopBodyWeight - LoopPHWeight;
+
+    // We have finished computing the edge weights; normalize them so that
+    // every edge weight is at least as big as the minimum edge weight.
+    LoopPHWeight = std::max(LoopPHWeight, MinimumEdgeWeight);
+    NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight);
+    BackedgeWeight = std::max(BackedgeWeight, MinimumEdgeWeight);
+  } else {
+    bool HasSmallAverageTripCount =
+        (LoopBodyWeight * SmallAverageTripCountThreshold) <= LoopExitWeight;
+    // When LoopBodyWeight << LoopExitWeight, i.e. the runtime trip count is
+    // very small, we model the guard branch as heavily biased towards exiting
+    // and give it a 95% chance to exit. This is because every time the guard
+    // branch is hit, it will most likely branch to the exit.
+    //
+    // When the loop has a larger average trip count (but still smaller than
+    // 1), we give the guard branch a 50% chance to exit.
+    const double NotTakenOnceRatio = HasSmallAverageTripCount ? 0.95 : 0.5;
+
+    // NOTE: We do not check whether the guard branch is unconditional here,
+    // as it should not be. However, it is not impossible that we get an
+    // incorrect loop body weight or loop exit weight.
+
+    // Calculate the not-taken-once weight and the backedge weight.
+    NotTakenOnceWeight = LoopExitWeight * NotTakenOnceRatio;
+    // Make sure that the not-taken-once weight is never smaller than
+    // LoopExitWeight - LoopBodyWeight. LoopBodyWeight is the maximum number
+    // of times the exit can be branched to from the latch, which makes
+    // LoopExitWeight - LoopBodyWeight the minimum number of times the exit
+    // is reached without executing a single iteration of the loop.
+    NotTakenOnceWeight =
+        std::max(NotTakenOnceWeight, LoopExitWeight - LoopBodyWeight);
+    // The number of times the loop is entered is the same as the number of
+    // times the latch branch goes to the exit block.
+    LoopPHWeight = LoopExitWeight - NotTakenOnceWeight;
+    // The backedge weight is the number of times the loop body is executed
+    // minus the number of times the loop body is entered.
+    BackedgeWeight = LoopBodyWeight - LoopPHWeight;
+
+    // We have finished computing the edge weights; normalize them so that
+    // every edge weight is at least as big as the minimum edge weight.
+    LoopPHWeight = std::max(LoopPHWeight, MinimumEdgeWeight);
+    NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight);
+    BackedgeWeight = std::max(BackedgeWeight, MinimumEdgeWeight);
+  }
+
+  // Sanity check ... no branch weight should be smaller than 1.
+  assert(BackedgeWeight >= MinimumEdgeWeight &&
+         LoopExitWeight >= MinimumEdgeWeight &&
+         NotTakenOnceWeight >= MinimumEdgeWeight &&
+         LoopPHWeight >= MinimumEdgeWeight &&
+         "Invalid branch weight detected!");
+
+  MDBuilder MDB(LatchBR->getFunction()->getContext());
+  // Update the latch branch metadata. The loop preheader weight is the same
+  // as the loop exit weight.
+  LatchBR->setMetadata(LLVMContext::MD_prof,
+                       LoopBodyOnTrue
+                           ? MDB.createBranchWeights(BackedgeWeight, LoopPHWeight)
+                           : MDB.createBranchWeights(LoopPHWeight, BackedgeWeight));
+  // Update the guard branch metadata.
+  if (GuardBR->isUnconditional()) {
+    // The guard branch has been simplified to an unconditional branch; it
+    // carries no branch metadata.
+    GuardBR->setMetadata(LLVMContext::MD_prof, nullptr);
+  } else {
+    // Make sure we know which successor the guard branch takes on true.
+    GuardBR->setMetadata(LLVMContext::MD_prof,
+                         GuardBR->getSuccessor(0) == L->getLoopPreheader()
+                             ? MDB.createBranchWeights(LoopPHWeight, NotTakenOnceWeight)
+                             : MDB.createBranchWeights(NotTakenOnceWeight, LoopPHWeight));
+  }
+}
+
 /// Determine whether the instructions in this range may be safely and cheaply
 /// speculated. This is not an important enough situation to develop complex
 /// heuristics. We handle a single arithmetic instruction along with any type
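Aside from the patch itself, here is a minimal standalone walk-through (plain C++, not LLVM code) of the large-average-trip-count arm above, run on the !prof !0 = {35, 7} latch weights from the loops_with_large_tripcount test further down. The variable names mirror the patch; everything else is illustrative only:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t LoopBodyWeight = 35, LoopExitWeight = 7, MinimumEdgeWeight = 1;
    // 35 >= 7 * 5, so the loop counts as having a large average trip count and
    // the guard branch is modeled with only a 5% chance of exiting.
    double NotTakenOnceRatio = 0.05;
    uint64_t NotTakenOnceWeight = LoopExitWeight * NotTakenOnceRatio; // 0 after truncation
    uint64_t LoopPHWeight = LoopExitWeight - NotTakenOnceWeight;      // 7
    uint64_t BackedgeWeight = LoopBodyWeight - LoopPHWeight;          // 28
    // Clamp to the minimum edge weight of 1.
    NotTakenOnceWeight = std::max(NotTakenOnceWeight, MinimumEdgeWeight); // 1
    // The guard branch gets {LoopPHWeight, NotTakenOnceWeight} = {7, 1} and the
    // latch gets {BackedgeWeight, LoopPHWeight} = {28, 7}, i.e. !0 and !1 in
    // loop-rotate-pgo.ll below.
    std::printf("guard {%llu, %llu}, latch {%llu, %llu}\n",
                (unsigned long long)LoopPHWeight,
                (unsigned long long)NotTakenOnceWeight,
                (unsigned long long)BackedgeWeight,
                (unsigned long long)LoopPHWeight);
    return 0;
  }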
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1105,9 +1105,9 @@
     return 0;
 
   // Divide the count of the backedge by the count of the edge exiting the loop,
-  // rounding to nearest.
+  // rounding to nearest. Trip count == backedge-taken count + 1.
   if (LatchBR->getSuccessor(0) == L->getHeader())
-    return (TrueVal + (FalseVal / 2)) / FalseVal;
+    return (TrueVal + (FalseVal / 2)) / FalseVal + 1;
   else
-    return (FalseVal + (TrueVal / 2)) / TrueVal;
+    return (FalseVal + (TrueVal / 2)) / TrueVal + 1;
 }
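A quick sanity check of the adjusted estimate (my own arithmetic, not part of the patch): with the !1 = {3001, 1001} weights from peel-loop-pgo.ll below and the latch's true successor being the header, the old formula gives (3001 + 500) / 1001 = 3 while the new one gives 3 + 1 = 4, which is why the expected peel iteration count in that test moves from 3 to 4:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Branch weights taken from peel-loop-pgo.ll's !1 metadata; the true
    // successor is assumed to be the loop header.
    uint64_t TrueVal = 3001, FalseVal = 1001;
    uint64_t OldEstimate = (TrueVal + (FalseVal / 2)) / FalseVal;     // rounds to 3
    uint64_t NewEstimate = (TrueVal + (FalseVal / 2)) / FalseVal + 1; // trip count 4
    assert(OldEstimate == 3 && NewEstimate == 4);
    return 0;
  }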
Index: test/Transforms/LoopRotate/loop-rotate-pgo.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopRotate/loop-rotate-pgo.ll
@@ -0,0 +1,154 @@
+; RUN: opt -S -loop-rotate -large-average-tripcount-multiplier-threshold=5 -small-average-tripcount-multiplier-threshold=5 < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+; This loop has a very large average trip count, so we bias the guard branch
+; heavily against exiting, i.e. only a 5% chance to exit.
+define i32 @loops_with_large_tripcount(i32 %n) {
+; CHECK: entry
+; CHECK: icmp slt i32 0, %n
+; CHECK: label %for.body.lr.ph, label %for.end, !prof !0
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %n
+  br i1 %cmp, label %for.body, label %for.end, !prof !0
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %n
+; CHECK-NEXT: label %for.body, label %for.cond.for.end_crit_edge, !prof !1
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has an average trip count bigger than 1 but no bigger than 5, so we
+; give the guard branch a 50% chance to exit.
+define i32 @loops_with_moderately_large_tripcount(i32 %n) {
+; CHECK: entry
+; CHECK: icmp slt i32 0, %n
+; CHECK: label %for.body.lr.ph, label %for.end, !prof !2
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %n
+  br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %n
+; CHECK-NEXT: label %for.body, label %for.cond.for.end_crit_edge, !prof !3
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has a very small trip count, so we bias the guard branch to have a
+; very high chance to exit - 95%.
+define i32 @loops_with_small_tripcount(i32 %a) {
+; CHECK: icmp slt i32 15, %a
+; CHECK-NEXT: label %for.end, label %for.body.lr.ph, !prof !4
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %a
+  br i1 %cmp, label %for.end, label %for.body, !prof !0
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %a
+; CHECK-NEXT: label %for.cond.for.end_crit_edge, label %for.body, !prof !5
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This loop has an average trip count smaller than 1 but no smaller than 1/5,
+; so we give the guard branch a 50% chance to exit.
+define i32 @loops_with_moderately_small_tripcount(i32 %a) {
+; CHECK: icmp slt i32 15, %a
+; CHECK-NEXT: label %for.end, label %for.body.lr.ph, !prof !6
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 15, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, %a
+  br i1 %cmp, label %for.end, label %for.body, !prof !1
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, %a
+; CHECK-NEXT: label %for.cond.for.end_crit_edge, label %for.body, !prof !0
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; This is a loop with a known trip count; make sure the guard branch metadata
+; is deleted after the branch is simplified to an unconditional br.
+define i32 @loops_with_known_tripcount() {
+; Make sure entry has a branch with no metadata.
+; CHECK: entry
+; CHECK-NOT: !prof
+; CHECK: for.body:
+entry:
+  br label %for.cond
+
+for.cond:
+  %index = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %index, 4
+  br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+  br label %for.inc
+
+; CHECK: for.inc
+; CHECK: icmp slt i32 %inc, 4
+; CHECK-NEXT: label %for.body, label %for.end, !prof !6
+for.inc:
+  %inc = add nsw i32 %index, 1
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+
+; CHECK: !0 = !{!"branch_weights", i32 7, i32 1}
+; CHECK: !1 = !{!"branch_weights", i32 28, i32 7}
+; CHECK: !2 = !{!"branch_weights", i32 4, i32 3}
+; CHECK: !3 = !{!"branch_weights", i32 24, i32 4}
+; CHECK: !4 = !{!"branch_weights", i32 33, i32 2}
+; CHECK: !5 = !{!"branch_weights", i32 2, i32 5}
+; CHECK: !6 = !{!"branch_weights", i32 21, i32 7}
+!0 = !{!"branch_weights", i32 35, i32 7}
+!1 = !{!"branch_weights", i32 28, i32 7}
+!2 = !{!"branch_weights", i32 4, i32 1}
Index: test/Transforms/LoopUnroll/peel-loop-pgo.ll
===================================================================
--- test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -1,11 +1,11 @@
 ; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-allow-peeling 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; Make sure we use the profile information correctly to peel-off 3 iterations
+; Make sure we use the profile information correctly to peel-off 4 iterations
 ; from the loop, and update the branch weights for the peeled loop properly.
 
 ; CHECK: Loop Unroll: F[basic]
-; CHECK: PEELING loop %for.body with iteration count 3!
+; CHECK: PEELING loop %for.body with iteration count 4!
 
 ; CHECK: Loop Unroll: F[optsize]
 ; CHECK-NOT: PEELING
@@ -16,7 +16,9 @@
 ; CHECK: [[NEXT1]]:
 ; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !3
 ; CHECK: [[NEXT2]]:
-; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !4
+; CHECK: br i1 %{{.*}}, label %[[NEXT3:.*]], label %for.cond.for.end_crit_edge, !prof !4
+; CHECK: [[NEXT3]]:
+; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !5
 
 define void @basic(i32* %p, i32 %k) #0 !prof !0 {
 entry:
@@ -80,7 +82,7 @@
 !1 = !{!"branch_weights", i32 3001, i32 1001}
 
 ;CHECK: !1 = !{!"branch_weights", i32 900, i32 101}
-;CHECK: !2 = !{!"branch_weights", i32 540, i32 360}
-;CHECK: !3 = !{!"branch_weights", i32 162, i32 378}
-;CHECK: !4 = !{!"branch_weights", i32 1399, i32 162}
-
+;CHECK: !2 = !{!"branch_weights", i32 607, i32 293}
+;CHECK: !3 = !{!"branch_weights", i32 273, i32 334}
+;CHECK: !4 = !{!"branch_weights", i32 61, i32 212}
+;CHECK: !5 = !{!"branch_weights", i32 1160, i32 61}