Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -703,6 +703,7 @@ Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); + Instruction *FoldPHIUserOpIntoPred(PHINode &PN); /// If an integer typed PHI has only one use which is an IntToPtr operation, /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise Index: lib/Transforms/InstCombine/InstCombinePHI.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombinePHI.cpp +++ lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -14,6 +14,7 @@ #include "InstCombineInternal.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" @@ -23,6 +24,10 @@ #define DEBUG_TYPE "instcombine" +static cl::opt +FoldPhiUser("instcombine-fold-phi-user", cl::Hidden, cl::init(true), + cl::desc("enable folding phi user into incoming values")); + /// The PHI arguments will be folded into a single operation with a PHI node /// as input. The debug location of the single operation will be the merged /// locations of the original PHI node arguments. @@ -635,6 +640,119 @@ return NewLI; } +// FoldPHIUserOpIntoPred finds a phi node that is used by only one add/sub and +// all its incoming values are ConstantInt or add/sub used only by this phi. +// In such a case, we can eliminate one add/sub by changing immediates. 
+// +// Example of redundant add instruction to be optimized: +// BB1: +// %add = add i64 %a, 5 +// br label %BB3 +// BB2: +// %sub = sub i64 %b, 3 +// br label %BB3 +// BB3: +// %phi = phi i64 [ %add, %BB1 ], [ %sub, %BB2 ] +// %rc = add i64 %phi, 1 # -> will be removed +// +// Additionally, if only one incoming value to the phi does not meet the above +// condition, we can move the add/sub instruction to avoid partially redundant +// computation. + +Instruction *InstCombiner::FoldPHIUserOpIntoPred(PHINode &Phi) { + // This optimization is disabled for Hexagon so far because it affects + // Hexagon loop idiom recognition. + Triple T(Phi.getModule()->getTargetTriple()); + if (T.getArch() == Triple::hexagon || !FoldPhiUser) + return nullptr; + + if (!Phi.hasOneUse()) + return nullptr; + + // We optimize a phi node that is used by only one add/sub instruction. + Instruction *User = Phi.user_back(); + ConstantInt *UserImm = nullptr; + if (!match(User, m_Add(m_Specific(&Phi), m_ConstantInt(UserImm))) && + !match(User, m_Sub(m_Specific(&Phi), m_ConstantInt(UserImm)))) + return nullptr; + + int FailCount = 0; + int FailedIdx = -1; + // Here we check all incoming values. + for (unsigned Idx = 0; Idx < Phi.getNumIncomingValues(); Idx++) { + Value *V = Phi.getIncomingValue(Idx); + // We can optimize constant int by changing the value. + if (isa(V)) + continue; + + // An add/sub with an immediate can be optimized if it is used only by + // this phi node. + if (V->hasOneUse() && + (match(V, m_Add(m_Value(), m_ConstantInt())) || + match(V, m_Sub(m_Value(), m_ConstantInt())))) + continue; + + // We need to handle the partially redundant case here. + // We do not eliminate partial redundancy if there is more than one + // incoming value that cannot be optimized, to avoid code size bloat. + if (++FailCount > 1) + break; + + // If this is a cyclic phi chain, moving the instruction may potentially cause + // an infinite loop. In this case, we do not set FailedIdx. 
+ std::function + IsPotentialPhiLoop = [&IsPotentialPhiLoop](Value *V, Value *AddVal) { + if (!V->hasOneUse() || !isa(V)) + return false; + PHINode *PN = dyn_cast(V); + for (Value *V : PN->incoming_values()) + if (V == AddVal || IsPotentialPhiLoop(V, AddVal)) + return true; + return false; + }; + if (IsPotentialPhiLoop(V, User)) break; + + // We remember which incoming value cannot be optimized. + FailedIdx = Idx; + } + + // If all incoming values can be optimized (FailCount == 0) or + // all but one incoming value can be optimized (FailCount == 1), + // apply optimization here. + if (FailCount == 0 || (FailCount == 1 && FailedIdx != -1)) { + for (unsigned Idx = 0; Idx < Phi.getNumIncomingValues(); Idx++) { + Value *V = Phi.getIncomingValue(Idx); + if ((int)Idx == FailedIdx) { + // We move the add/sub instruction into the BB for which we cannot + // change an immediate in the incoming value. + assert(FailCount != 0 && + "FailedIdx must not be set for fully redundant case"); + User->setOperand(0, V); + User->moveBefore(Phi.getIncomingBlock(Idx)->getTerminator()); + } else if (isa(V)) { + // Update the immediate of the add/sub instruction. + Instruction *I = cast(V); + ConstantInt *PredImm = cast(I->getOperand(1)); + auto PM = (User->getOpcode() == I->getOpcode()) ? Instruction::Add: + Instruction::Sub; + Value* NewImm = ConstantExpr::get(PM, PredImm, UserImm); + I->setOperand(1, NewImm); + } + else if (isa(V)) { + ConstantInt *PredImm = cast(V); + Value* NewImm = ConstantExpr::get(User->getOpcode(), PredImm, UserImm); + Phi.setIncomingValue(Idx, NewImm); + } + } + User->replaceAllUsesWith(&Phi); + if (FailedIdx != -1) + Phi.setIncomingValue(FailedIdx, User); + return &Phi; + } + + return nullptr; +} + /// TODO: This function could handle other cast types, but then it might /// require special-casing a cast from the 'i1' type. See the comment in /// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. 
@@ -1130,6 +1248,11 @@ if (Instruction *Result = FoldPHIArgOpIntoPHI(PN)) return Result; + if (PN.hasOneUse()) { + if (Instruction *Result = FoldPHIUserOpIntoPred(PN)) + return Result; + } + // If this is a trivial cycle in the PHI node graph, remove it. Basically, if // this PHI only has a single use (a PHI), and if that PHI only has one use (a // PHI)... break the cycle. Index: test/Analysis/ValueTracking/non-negative-phi-bits.ll =================================================================== --- test/Analysis/ValueTracking/non-negative-phi-bits.ll +++ test/Analysis/ValueTracking/non-negative-phi-bits.ll @@ -6,9 +6,9 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV]], 40 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 40 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void Index: test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll =================================================================== --- test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll +++ test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll @@ -1,4 +1,4 @@ -; RUN: opt -indvars -instcombine -S < %s | FileCheck %s +; RUN: opt -indvars -instcombine -instcombine-fold-phi-user=0 -S < %s | FileCheck %s ;; Test that loop's exit value is rewritten to its initial ;; value from loop preheader Index: test/Transforms/InstCombine/stacksaverestore.ll =================================================================== --- test/Transforms/InstCombine/stacksaverestore.ll +++ test/Transforms/InstCombine/stacksaverestore.ll @@ -102,7 
+102,7 @@ ; CHECK-LABEL: define void @test3( ; CHECK: loop: -; CHECK: %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK: %i = phi i32 [ 1, %entry ], [ %i1, %loop ] ; CHECK: %save1 = call i8* @llvm.stacksave() ; CHECK: %argmem = alloca inalloca i32 ; CHECK: store i32 0, i32* %argmem Index: test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll +++ test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll @@ -37,13 +37,13 @@ ; PROLOG-NEXT: [[TMP1:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; PROLOG-NEXT: br i1 [[TMP1]], label %loop_header.prol.loopexit, label %loop_header.prol.preheader ; PROLOG: loop_header.prol: -; PROLOG-NEXT: %iv.prol = phi i64 [ 0, %loop_header.prol.preheader ], [ %iv_next.prol, %loop_latch.prol ] +; PROLOG-NEXT: %iv.prol = phi i64 [ 1, %loop_header.prol.preheader ], [ %iv_next.prol, %loop_latch.prol ] ; PROLOG-NEXT: %prol.iter = phi i64 [ [[XTRAITER]], %loop_header.prol.preheader ], [ %prol.iter.sub, %loop_latch.prol ] ; PROLOG-NEXT: br i1 %cond, label %loop_latch.prol, label %loop_exiting_bb1.prol ; PROLOG: loop_latch.prol: -; PROLOG-NEXT: %iv_next.prol = add i64 %iv.prol, 1 ; PROLOG-NEXT: %prol.iter.sub = add i64 %prol.iter, -1 ; PROLOG-NEXT: %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 +; PROLOG-NEXT: %iv_next.prol = add i64 %iv.prol, 1 ; PROLOG-NEXT: br i1 %prol.iter.cmp, label %loop_header.prol.loopexit.unr-lcssa, label %loop_header.prol ; PROLOG: loop_latch.7: ; PROLOG-NEXT: %iv_next.7 = add i64 %iv, 8 @@ -172,7 +172,7 @@ ; PROLOG: loop_exiting_bb1.7: ; PROLOG-NEXT: switch i64 %sum.next.6, label %loop_latch.7 ; PROLOG: loop_latch.7: -; PROLOG-NEXT: %iv_next.7 = add nsw i64 %iv, 8 +; PROLOG-NEXT: %iv_next.7 = add nuw nsw i64 %iv, 8 ; PROLOG-NEXT: %sum.next.7 = add i64 %sum.next.6, %add ; PROLOG-NEXT: %cmp.7 = icmp eq i64 %iv_next.7, %trip ; PROLOG-NEXT: br i1 %cmp.7, label %exit2.loopexit.unr-lcssa, label 
%loop_header @@ -488,7 +488,7 @@ define i8 addrspace(1)* @test9(i8* nocapture readonly %arg, i32 %n) { ; PROLOG: test9( ; PROLOG: header.prol: -; PROLOG-NEXT: %phi.prol = phi i64 [ 0, %header.prol.preheader ], [ %iv.next.prol, %latch.prol ] +; PROLOG-NEXT: %phi.prol = phi i64 [ 1, %header.prol.preheader ], [ %iv.next.prol, %latch.prol ] ; PROLOG: latch.prol: ; PROLOG-NOT: trip ; PROLOG: br i1 %prol.iter.cmp, label %header.prol.loopexit.unr-lcssa, label %header.prol Index: test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1988,7 +1988,7 @@ ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 24, [[ENTRY:%.*]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ , [[ENTRY]] ] ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !41 @@ -2022,9 +2022,9 @@ ; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]] ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]] ; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48 -; AVX512-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24 ; AVX512-NEXT: 
[[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], -; AVX512-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624 +; AVX512-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX]], 624 +; AVX512-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24 ; AVX512-NEXT: br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49 ; AVX512: for.body.preheader: ; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] @@ -2032,7 +2032,7 @@ ; AVX512-NEXT: br label [[FOR_BODY_PROL:%.*]] ; AVX512: for.body.prol: ; AVX512-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ] -; AVX512-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]] ; AVX512-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4 ; AVX512-NEXT: [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100 @@ -2048,8 +2048,8 @@ ; AVX512-NEXT: br label [[FOR_INC_PROL]] ; AVX512: for.inc.prol: ; AVX512-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16 +; AVX512-NEXT: [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER]], 0 ; AVX512-NEXT: [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1 -; AVX512-NEXT: [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0 ; AVX512-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50 ; AVX512: for.body.prol.loopexit: ; AVX512-NEXT: [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984 Index: test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ 
test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -33,7 +33,7 @@ ; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]] ; GENERIC: for.body: -; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* @@ -92,8 +92,8 @@ ; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2 ; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32 ; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]] +; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]] ; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1 -; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; ; KRYO-LABEL: @gather_reduce_8x16_i32( @@ -108,7 +108,7 @@ ; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]] ; KRYO: for.body: -; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* @@ -167,8 +167,8 @@ ; 
KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2 ; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32 ; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]] +; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]] ; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1 -; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; entry: @@ -294,7 +294,7 @@ ; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]] ; GENERIC: for.body: -; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* @@ -353,8 +353,8 @@ ; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2 ; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32 ; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]] +; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]] ; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1 -; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; ; KRYO-LABEL: @gather_reduce_8x16_i64( @@ -369,7 +369,7 @@ ; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]] ; KRYO: for.body: -; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; 
KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* @@ -428,8 +428,8 @@ ; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2 ; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32 ; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]] +; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_0103]], [[N]] ; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1 -; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; entry: