Index: lib/Transforms/Scalar/IndVarSimplify.cpp
===================================================================
--- lib/Transforms/Scalar/IndVarSimplify.cpp
+++ lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -762,6 +762,8 @@
 
   Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
 
+  bool WidenLoopCompare(NarrowIVDefUse DU);
+
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
 } // anonymous namespace
@@ -926,6 +928,35 @@
   DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
 }
 
+/// If the narrow use is a compare instruction and the compare has a single
+/// use, widen the compare instruction (and possibly the other operand).
+bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) {
+  ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+  if (!Cmp || !Cmp->hasOneUse())
+    return false;
+
+  // FIXME: Add support for unsigned compare.
+  bool IsSigned = CmpInst::isSigned(Cmp->getPredicate());
+  if (!IsSigned)
+    return false;
+
+  Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+  unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+  unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+  assert(CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+  // Widen the compare instruction.
+  IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+  // Widen the other operand of the compare, if necessary.
+  if (CastWidth < IVWidth) {
+    Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp);
+    DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+  }
+  return true;
+}
+
 /// WidenIVUse - Determine whether an individual user of the narrow IV can be
 /// widened. If so, return the wide clone of the user.
 Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
@@ -993,10 +1024,15 @@
 
   // Does this user itself evaluate to a recurrence after widening?
   const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+  if (!WideAddRec)
+    WideAddRec = GetExtendedOperandRecurrence(DU);
+
   if (!WideAddRec) {
-    WideAddRec = GetExtendedOperandRecurrence(DU);
-  }
-  if (!WideAddRec) {
+    // If the use is a loop condition, try to promote the condition instead of
+    // truncating the IV first.
+    if (WidenLoopCompare(DU))
+      return nullptr;
+
     // This user does not evaluate to a recurrence after widening, so don't
     // follow it. Instead insert a Trunc to kill off the original use,
     // eventually isolating the original narrow IV so it can be removed.
Index: test/Transforms/IndVarSimplify/2011-09-27-hoistsext.ll
===================================================================
--- test/Transforms/IndVarSimplify/2011-09-27-hoistsext.ll
+++ test/Transforms/IndVarSimplify/2011-09-27-hoistsext.ll
@@ -12,6 +12,7 @@
 
 ; CHECK: for.body:
 ; CHECK-NOT: sext
+; CHECK: indvars.iv.next
 ; CHECK: br
 for.body:
   %i2.115 = phi i32 [ 0, %entry ], [ %add249, %for.body ]
Index: test/Transforms/IndVarSimplify/elim-extend.ll
===================================================================
--- test/Transforms/IndVarSimplify/elim-extend.ll
+++ test/Transforms/IndVarSimplify/elim-extend.ll
@@ -7,6 +7,7 @@
 define void @postincConstIV(i8* %base, i32 %limit) nounwind {
 entry:
   br label %loop
+; CHECK: sext i32
 ; CHECK: loop:
 ; CHECK-NOT: sext
 ; CHECK: exit:
Index: test/Transforms/IndVarSimplify/no-iv-rewrite.ll
===================================================================
--- test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -229,10 +229,10 @@
 ; loop and the OR instruction is replaced by an ADD keeping the result
 ; equivalent.
 ;
+; CHECK: sext
 ; CHECK: loop:
 ; CHECK: phi i64
 ; CHECK-NOT: sext
-; CHECK: icmp slt i32
 ; CHECK: exit:
 ; CHECK: add i64
 loop:
Index: test/Transforms/IndVarSimplify/verify-scev.ll
===================================================================
--- test/Transforms/IndVarSimplify/verify-scev.ll
+++ test/Transforms/IndVarSimplify/verify-scev.ll
@@ -380,11 +380,11 @@
 
 for.body65.lr.ph:                                 ; preds = %for.body48
   %0 = load i32* undef, align 4
+  %1 = sext i32 %0 to i64
   br label %for.body65.us
 
 for.body65.us:                                    ; preds = %for.inc219.us, %for.body65.lr.ph
-  %k.09.us = phi i32 [ %inc.us, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
-  %idxprom66.us = sext i32 %k.09.us to i64
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
   br i1 undef, label %for.inc219.us, label %if.end72.us
 
 if.end72.us:                                      ; preds = %for.body65.us
@@ -406,8 +406,8 @@
   br i1 undef, label %for.cond139.loopexit.us, label %for.cond152.us
 
 for.inc219.us:                                    ; preds = %for.cond139.loopexit.us, %if.end110.us, %if.end93.us, %for.body65.us
-  %inc.us = add nsw i32 %k.09.us, 1
-  %cmp64.us = icmp sgt i32 %inc.us, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp64.us = icmp sgt i64 %indvars.iv.next, %1
   br i1 %cmp64.us, label %for.inc221, label %for.body65.us
 
 for.cond139.loopexit.us:                          ; preds = %for.cond152.us
Index: test/Transforms/IndVarSimplify/widen-loop-comp.ll
===================================================================
--- /dev/null
+++ test/Transforms/IndVarSimplify/widen-loop-comp.ll
@@ -0,0 +1,137 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+target triple = "aarch64--linux-gnu"
+
+; Check that the loop exit i32 compare instruction and operand are widened to
+; i64 instead of truncating the IV before its use in the i32 compare instruction.
+
+@idx = common global i32 0, align 4
+@e = common global i32 0, align 4
+@ptr = common global i32* null, align 8
+
+; CHECK-LABEL: @test1
+; CHECK: for.body.lr.ph:
+; CHECK: sext i32
+; CHECK: for.cond:
+; CHECK: icmp slt i64
+; CHECK: for.body:
+; CHECK: phi i64
+
+define i32 @test1() {
+entry:
+  store i32 -1, i32* @idx, align 4
+  %0 = load i32* @e, align 4
+  %cmp4 = icmp slt i32 %0, 0
+  br i1 %cmp4, label %for.end.loopexit, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %1 = load i32** @ptr, align 8
+  %2 = load i32* @e, align 4
+  br label %for.body
+
+for.cond:
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, %2
+  br i1 %cmp, label %for.body, label %for.cond.for.end.loopexit_crit_edge
+
+for.body:
+  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.cond ]
+  %idxprom = sext i32 %i.05 to i64
+  %arrayidx = getelementptr inbounds i32* %1, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %3, 0
+  br i1 %tobool, label %if.then, label %for.cond
+
+if.then:
+  %i.05.lcssa = phi i32 [ %i.05, %for.body ]
+  store i32 %i.05.lcssa, i32* @idx, align 4
+  br label %for.end
+
+for.cond.for.end.loopexit_crit_edge:
+  br label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  %4 = load i32* @idx, align 4
+  ret i32 %4
+}
+
+; CHECK-LABEL: @test2
+; CHECK: for.body4.us
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %cmp2.us = icmp slt i64
+; CHECK-NOT: %2 = trunc i64 %indvars.iv.next to i32
+; CHECK-NOT: %cmp2.us = icmp slt i32
+
+define void @test2([8 x i8]* %a, i8* %b, i8 %limit) {
+entry:
+  %conv = zext i8 %limit to i32
+  %cmp23 = icmp eq i8 %limit, 0
+  br i1 %cmp23, label %for.cond1.preheader, label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %storemerge5.us = phi i32 [ 0, %entry ], [ %inc14.us, %for.inc13.us ]
+  br i1 true, label %for.body4.lr.ph.us, label %for.inc13.us
+
+for.inc13.us:
+  %inc14.us = add nsw i32 %storemerge5.us, 1
+  %cmp.us = icmp slt i32 %inc14.us, 4
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.end
+
+for.body4.us:
+  %storemerge14.us = phi i32 [ 0, %for.body4.lr.ph.us ], [ %inc.us, %for.body4.us ]
+  %idxprom.us = sext i32 %storemerge14.us to i64
+  %arrayidx6.us = getelementptr inbounds [8 x i8]* %a, i64 %idxprom5.us, i64 %idxprom.us
+  %0 = load i8* %arrayidx6.us, align 1
+  %idxprom7.us = zext i8 %0 to i64
+  %arrayidx8.us = getelementptr inbounds i8* %b, i64 %idxprom7.us
+  %1 = load i8* %arrayidx8.us, align 1
+  store i8 %1, i8* %arrayidx6.us, align 1
+  %inc.us = add nsw i32 %storemerge14.us, 1
+  %cmp2.us = icmp slt i32 %inc.us, %conv
+  br i1 %cmp2.us, label %for.body4.us, label %for.inc13.us
+
+for.body4.lr.ph.us:
+  %idxprom5.us = sext i32 %storemerge5.us to i64
+  br label %for.body4.us
+
+for.cond1.preheader:
+  %storemerge5 = phi i32 [ 0, %entry ], [ %inc14, %for.inc13 ]
+  br i1 false, label %for.inc13, label %for.inc13
+
+for.inc13:
+  %inc14 = add nsw i32 %storemerge5, 1
+  %cmp = icmp slt i32 %inc14, 4
+  br i1 %cmp, label %for.cond1.preheader, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test3
+; CHECK: for.cond:
+; CHECK: phi i64
+; CHECK: icmp ne i64
+
+define i32 @test3(i32* %a) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, 1000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %sum.0, %0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret i32 %sum.0
+}
Index: test/Transforms/LoopSimplify/merge-exits.ll
===================================================================
--- test/Transforms/LoopSimplify/merge-exits.ll
+++ test/Transforms/LoopSimplify/merge-exits.ll
@@ -1,6 +1,4 @@
-; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info > %t
-; RUN: not grep sext %t
-; RUN: grep "phi i64" %t | count 1
+; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info | FileCheck %s
 
 ; Loopsimplify should be able to merge the two loop exits
 ; into one, so that loop rotate can rotate the loop, so
@@ -9,36 +7,42 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
 
-define float @t(float* %pTmp1, float* %peakWeight, i32 %bandEdgeIndex) nounwind {
+; CHECK-LABEL: @test1
+; CHECK: bb
+; CHECK: phi i64
+; CHECK-NOT: phi i64
+; CHECK-NOT: sext
+
+define float @test1(float* %pTmp1, float* %peakWeight, i32 %bandEdgeIndex) nounwind {
 entry:
-  %t0 = load float* %peakWeight, align 4          ; <float> [#uses=1]
+  %t0 = load float* %peakWeight, align 4
   br label %bb1
 
-bb:                                               ; preds = %bb2
-  %t1 = sext i32 %hiPart.0 to i64                 ; <i64> [#uses=1]
-  %t2 = getelementptr float* %pTmp1, i64 %t1      ; <float*> [#uses=1]
-  %t3 = load float* %t2, align 4                  ; <float> [#uses=1]
-  %t4 = fadd float %t3, %distERBhi.0              ; <float> [#uses=1]
-  %t5 = add i32 %hiPart.0, 1                      ; <i32> [#uses=2]
-  %t6 = sext i32 %t5 to i64                       ; <i64> [#uses=1]
-  %t7 = getelementptr float* %peakWeight, i64 %t6 ; <float*> [#uses=1]
-  %t8 = load float* %t7, align 4                  ; <float> [#uses=1]
-  %t9 = fadd float %t8, %peakCount.0              ; <float> [#uses=1]
+bb:
+  %t1 = sext i32 %hiPart.0 to i64
+  %t2 = getelementptr float* %pTmp1, i64 %t1
+  %t3 = load float* %t2, align 4
+  %t4 = fadd float %t3, %distERBhi.0
+  %t5 = add i32 %hiPart.0, 1
+  %t6 = sext i32 %t5 to i64
+  %t7 = getelementptr float* %peakWeight, i64 %t6
+  %t8 = load float* %t7, align 4
+  %t9 = fadd float %t8, %peakCount.0
   br label %bb1
 
-bb1:                                              ; preds = %bb, %entry
-  %peakCount.0 = phi float [ %t0, %entry ], [ %t9, %bb ]          ; <float> [#uses=2]
-  %hiPart.0 = phi i32 [ 0, %entry ], [ %t5, %bb ]                 ; <i32> [#uses=3]
-  %distERBhi.0 = phi float [ 0.000000e+00, %entry ], [ %t4, %bb ] ; <float> [#uses=3]
-  %t10 = fcmp uge float %distERBhi.0, 2.500000e+00                ; <i1> [#uses=1]
+bb1:
+  %peakCount.0 = phi float [ %t0, %entry ], [ %t9, %bb ]
+  %hiPart.0 = phi i32 [ 0, %entry ], [ %t5, %bb ]
+  %distERBhi.0 = phi float [ 0.000000e+00, %entry ], [ %t4, %bb ]
+  %t10 = fcmp uge float %distERBhi.0, 2.500000e+00
   br i1 %t10, label %bb3, label %bb2
 
-bb2:                                              ; preds = %bb1
-  %t11 = add i32 %bandEdgeIndex, -1               ; <i32> [#uses=1]
-  %t12 = icmp sgt i32 %t11, %hiPart.0             ; <i1> [#uses=1]
+bb2:
+  %t11 = add i32 %bandEdgeIndex, -1
+  %t12 = icmp sgt i32 %t11, %hiPart.0
   br i1 %t12, label %bb, label %bb3
 
-bb3:                                              ; preds = %bb2, %bb1
-  %t13 = fdiv float %peakCount.0, %distERBhi.0    ; <float> [#uses=1]
+bb3:
+  %t13 = fdiv float %peakCount.0, %distERBhi.0
   ret float %t13
 }
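
A minimal before/after sketch of what WidenLoopCompare enables, hand-written in
the style of the tests above (the names %i, %limit, and %limit.wide are
illustrative; this is not verbatim -indvars output):

  ; Before: the exit compare is i32, so once the IV is widened to i64 the
  ; pass would otherwise have to truncate the wide IV back for the compare:
  ;   %inc = add nsw i32 %i, 1
  ;   %cmp = icmp slt i32 %inc, %limit
  ;   br i1 %cmp, label %loop, label %exit
  ;
  ; After: when the signed compare is the narrow def's use, the compare is
  ; widened along with the IV and the other operand is sign-extended once,
  ; outside the loop (as in the verify-scev.ll change above):
  ;   %limit.wide = sext i32 %limit to i64
  ;   ...
  ;   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ;   %cmp = icmp slt i64 %indvars.iv.next, %limit.wide
  ;   br i1 %cmp, label %loop, label %exit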