Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -703,6 +703,7 @@
   Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
   Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
   Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN);
+  Instruction *FoldPHIUserOpIntoPred(PHINode &PN);
 
   /// If an integer typed PHI has only one use which is an IntToPtr operation,
   /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
Index: lib/Transforms/InstCombine/InstCombinePHI.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -14,6 +14,7 @@
 #include "InstCombineInternal.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/PatternMatch.h"
@@ -23,6 +24,10 @@
 
 #define DEBUG_TYPE "instcombine"
 
+static cl::opt<bool>
+FoldPhiUser("instcombine-fold-phi-user", cl::Hidden, cl::init(true),
+            cl::desc("enable folding phi user into incoming values"));
+
 /// The PHI arguments will be folded into a single operation with a PHI node
 /// as input. The debug location of the single operation will be the merged
 /// locations of the original PHI node arguments.
@@ -635,6 +640,119 @@
   return NewLI;
 }
 
+// FoldPHIUserOpIntoPred finds a phi node that is used by only one add/sub and
+// whose incoming values are all ConstantInt or add/sub used only by this phi.
+// In such a case, we can eliminate one add/sub by changing immediates.
+//
+// Example of redundant add instruction to be optimized:
+// BB1:
+//   %add = add i64 %a, 5
+//   br label %BB3
+// BB2:
+//   %sub = sub i64 %b, 3
+//   br label %BB3
+// BB3:
+//   %phi = phi i64 [ %add, %BB1 ], [ %sub, %BB2 ]
+//   %rc = add i64 %phi, 1 ; -> will be removed
+//
+// Additionally, if exactly one incoming value of the phi does not meet the
+// above condition, we move the add/sub instruction into that predecessor block
+// to avoid partially redundant computation.
+
+Instruction *InstCombiner::FoldPHIUserOpIntoPred(PHINode &Phi) {
+  // Bail out if disabled by -instcombine-fold-phi-user or on Hexagon, where
+  // this transform so far affects Hexagon loop idiom recognition.
+  Triple T(Phi.getModule()->getTargetTriple());
+  if (T.getArch() == Triple::hexagon || !FoldPhiUser)
+    return nullptr;
+
+  if (!Phi.hasOneUse())
+    return nullptr;
+
+  // The only user must be an add/sub of this phi with a ConstantInt (UserImm).
+  Instruction *User = Phi.user_back();
+  ConstantInt *UserImm = nullptr;
+  if (!match(User, m_Add(m_Specific(&Phi), m_ConstantInt(UserImm))) &&
+      !match(User, m_Sub(m_Specific(&Phi), m_ConstantInt(UserImm))))
+    return nullptr;
+
+  int FailCount = 0;
+  int FailedIdx = -1;
+  // Classify each incoming value; FailedIdx records the one we cannot rewrite.
+  for (unsigned Idx = 0; Idx < Phi.getNumIncomingValues(); Idx++) {
+    Value *V = Phi.getIncomingValue(Idx);
+    // A ConstantInt can be adjusted directly in the phi.
+    if (isa<ConstantInt>(V))
+      continue;
+
+    // An add/sub with an immediate can be optimized if it is used only by
+    // this phi. NOTE(review): verify a ConstantExpr add/sub cannot match.
+    if (V->hasOneUse() &&
+        (match(V, m_Add(m_Value(), m_ConstantInt())) ||
+         match(V, m_Sub(m_Value(), m_ConstantInt()))))
+      continue;
+
+    // V cannot be rewritten, so this is the partially redundant case.
+    // We do not eliminate partial redundancy if more than one incoming value
+    // cannot be optimized, to avoid code size bloat.
+    if (++FailCount > 1)
+      break;
+
+    // If V starts a chain of single-use phis that reaches User, moving the
+    // instruction may cause an infinite loop. In this case, FailedIdx stays -1.
+    std::function<bool(Value*,Value*)>
+    IsPotentialPhiLoop = [&IsPotentialPhiLoop](Value *V, Value *AddVal) {
+      if (!V->hasOneUse() || !isa<PHINode>(V))
+        return false;
+      PHINode *PN = dyn_cast<PHINode>(V);
+      for (Value *V : PN->incoming_values())
+        if (V == AddVal || IsPotentialPhiLoop(V, AddVal))
+          return true;
+      return false;
+    };
+    if (IsPotentialPhiLoop(V, User)) break;
+
+    // Remember which incoming value cannot be optimized.
+    FailedIdx = Idx;
+  }
+
+  // If all incoming values can be optimized (FailCount == 0), or all but one
+  // can be and that one was recorded (FailCount == 1 && FailedIdx != -1),
+  // apply the optimization here.
+  if (FailCount == 0 || (FailCount == 1 && FailedIdx != -1)) {
+    for (unsigned Idx = 0; Idx < Phi.getNumIncomingValues(); Idx++) {
+      Value *V = Phi.getIncomingValue(Idx);
+      if ((int)Idx == FailedIdx) {
+        // This incoming value has no immediate we can change, so move the
+        // add/sub into its incoming block, right before the terminator.
+        assert(FailCount != 0 &&
+               "FailedIdx must not be set for fully redundant case");
+        User->setOperand(0, V);
+        User->moveBefore(Phi.getIncomingBlock(Idx)->getTerminator());
+      } else if (isa<Instruction>(V)) {
+        // Fold UserImm into I's immediate. NOTE(review): recheck I's nsw/nuw.
+        Instruction *I = cast<Instruction>(V);
+        ConstantInt *PredImm = cast<ConstantInt>(I->getOperand(1));
+        auto PM = (User->getOpcode() == I->getOpcode()) ? Instruction::Add:
+                                                          Instruction::Sub;
+        Value* NewImm = ConstantExpr::get(PM, PredImm, UserImm);
+        I->setOperand(1, NewImm);
+      }
+      else if (isa<ConstantInt>(V)) {
+        ConstantInt *PredImm = cast<ConstantInt>(V);
+        Value* NewImm = ConstantExpr::get(User->getOpcode(), PredImm, UserImm);
+        Phi.setIncomingValue(Idx, NewImm);
+      }
+    }
+    User->replaceAllUsesWith(&Phi);
+    if (FailedIdx != -1)
+      Phi.setIncomingValue(FailedIdx, User);
+    return &Phi;
+  }
+
+  return nullptr;
+}
+
 /// TODO: This function could handle other cast types, but then it might
 /// require special-casing a cast from the 'i1' type. See the comment in
 /// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
@@ -1130,6 +1248,11 @@
     if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
       return Result;
 
+  if (PN.hasOneUse()) {
+    if (Instruction *Result = FoldPHIUserOpIntoPred(PN))
+      return Result;
+  }
+
   // If this is a trivial cycle in the PHI node graph, remove it.  Basically, if
   // this PHI only has a single use (a PHI), and if that PHI only has one use (a
   // PHI)... break the cycle.
Index: test/Analysis/ValueTracking/non-negative-phi-bits.ll
===================================================================
--- test/Analysis/ValueTracking/non-negative-phi-bits.ll
+++ test/Analysis/ValueTracking/non-negative-phi-bits.ll
@@ -6,9 +6,9 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV]], 40
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 40
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
Index: test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll
===================================================================
--- test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll
+++ test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll
@@ -1,4 +1,4 @@
-; RUN: opt -indvars -instcombine -S < %s | FileCheck %s
+; RUN: opt -indvars -instcombine -instcombine-fold-phi-user=0 -S < %s | FileCheck %s
 
 ;; Test that loop's exit value is rewritten to its initial
 ;; value from loop preheader
Index: test/Transforms/InstCombine/stacksaverestore.ll
===================================================================
--- test/Transforms/InstCombine/stacksaverestore.ll
+++ test/Transforms/InstCombine/stacksaverestore.ll
@@ -102,7 +102,7 @@
 
 ; CHECK-LABEL: define void @test3(
 ; CHECK: loop:
-; CHECK: %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK: %i = phi i32 [ 1, %entry ], [ %i1, %loop ]
 ; CHECK: %save1 = call i8* @llvm.stacksave()
 ; CHECK: %argmem = alloca inalloca i32
 ; CHECK: store i32 0, i32* %argmem
Index: test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
===================================================================
--- test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
+++ test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
@@ -37,13 +37,13 @@
 ; PROLOG-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[XTRAITER]], 0
 ; PROLOG-NEXT:    br i1 [[TMP1]], label %loop_header.prol.loopexit, label %loop_header.prol.preheader
 ; PROLOG:       loop_header.prol:
-; PROLOG-NEXT:    %iv.prol = phi i64 [ 0, %loop_header.prol.preheader ], [ %iv_next.prol, %loop_latch.prol ]
+; PROLOG-NEXT:    %iv.prol = phi i64 [ 1, %loop_header.prol.preheader ], [ %iv_next.prol, %loop_latch.prol ]
 ; PROLOG-NEXT:    %prol.iter = phi i64 [ [[XTRAITER]], %loop_header.prol.preheader ], [ %prol.iter.sub, %loop_latch.prol ]
 ; PROLOG-NEXT:    br i1 %cond, label %loop_latch.prol, label %loop_exiting_bb1.prol
 ; PROLOG:       loop_latch.prol:
-; PROLOG-NEXT:    %iv_next.prol = add i64 %iv.prol, 1
 ; PROLOG-NEXT:    %prol.iter.sub = add i64 %prol.iter, -1
 ; PROLOG-NEXT:    %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0
+; PROLOG-NEXT:    %iv_next.prol = add i64 %iv.prol, 1
 ; PROLOG-NEXT:    br i1 %prol.iter.cmp, label %loop_header.prol.loopexit.unr-lcssa, label %loop_header.prol
 ; PROLOG:  loop_latch.7:
 ; PROLOG-NEXT:     %iv_next.7 = add i64 %iv, 8
@@ -172,7 +172,7 @@
 ; PROLOG:  loop_exiting_bb1.7:
 ; PROLOG-NEXT:     switch i64 %sum.next.6, label %loop_latch.7
 ; PROLOG:  loop_latch.7:
-; PROLOG-NEXT:     %iv_next.7 = add nsw i64 %iv, 8
+; PROLOG-NEXT:     %iv_next.7 = add nuw nsw i64 %iv, 8
 ; PROLOG-NEXT:     %sum.next.7 = add i64 %sum.next.6, %add
 ; PROLOG-NEXT:     %cmp.7 = icmp eq i64 %iv_next.7, %trip
 ; PROLOG-NEXT:     br i1 %cmp.7, label %exit2.loopexit.unr-lcssa, label %loop_header
@@ -488,7 +488,7 @@
 define i8 addrspace(1)* @test9(i8* nocapture readonly %arg, i32 %n) {
 ; PROLOG: test9(
 ; PROLOG: header.prol:
-; PROLOG-NEXT: %phi.prol = phi i64 [ 0, %header.prol.preheader ], [ %iv.next.prol, %latch.prol ]
+; PROLOG-NEXT: %phi.prol = phi i64 [ 1, %header.prol.preheader ], [ %iv.next.prol, %latch.prol ]
 ; PROLOG: latch.prol:
 ; PROLOG-NOT: trip
 ; PROLOG:     br i1 %prol.iter.cmp, label %header.prol.loopexit.unr-lcssa, label %header.prol
Index: test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1988,7 +1988,7 @@
 ; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]]
 ; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 24, [[ENTRY:%.*]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[ENTRY]] ]
 ; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
@@ -2022,9 +2022,9 @@
 ; AVX512-NEXT:    [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]]
 ; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]]
 ; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48
-; AVX512-NEXT:    [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24
 ; AVX512-NEXT:    [[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], <i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384>
-; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624
+; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX]], 624
+; AVX512-NEXT:    [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24
 ; AVX512-NEXT:    br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49
 ; AVX512:       for.body.preheader:
 ; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
@@ -2032,7 +2032,7 @@
 ; AVX512-NEXT:    br label [[FOR_BODY_PROL:%.*]]
 ; AVX512:       for.body.prol:
 ; AVX512-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ]
-; AVX512-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; AVX512-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]]
 ; AVX512-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4
 ; AVX512-NEXT:    [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100
@@ -2048,8 +2048,8 @@
 ; AVX512-NEXT:    br label [[FOR_INC_PROL]]
 ; AVX512:       for.inc.prol:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16
+; AVX512-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER]], 0
 ; AVX512-NEXT:    [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
-; AVX512-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
 ; AVX512-NEXT:    br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50
 ; AVX512:       for.body.prol.loopexit:
 ; AVX512-NEXT:    [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984
Index: test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -33,7 +33,7 @@
 ; GENERIC-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; GENERIC-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; GENERIC:       for.body:
-; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
@@ -92,8 +92,8 @@
 ; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
 ; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]]
 ; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
-; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; GENERIC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
 ; KRYO-LABEL: @gather_reduce_8x16_i32(
@@ -108,7 +108,7 @@
 ; KRYO-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; KRYO-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; KRYO:       for.body:
-; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
@@ -167,8 +167,8 @@
 ; KRYO-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
 ; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]]
 ; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
-; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; KRYO-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
 entry:
@@ -294,7 +294,7 @@
 ; GENERIC-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; GENERIC-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; GENERIC:       for.body:
-; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
@@ -353,8 +353,8 @@
 ; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
 ; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]]
 ; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
-; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; GENERIC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
 ; KRYO-LABEL: @gather_reduce_8x16_i64(
@@ -369,7 +369,7 @@
 ; KRYO-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; KRYO-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; KRYO:       for.body:
-; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
@@ -428,8 +428,8 @@
 ; KRYO-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
 ; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]]
 ; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
-; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; KRYO-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
 entry: