Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3866,7 +3866,16 @@
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
   RecurKind RK = RdxDesc.getRecurrenceKind();
-  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+  // There are two start values in play. For non-epilogue reduction, these
+  // are the same. For epilogue vectorization, the recurrence start value
+  // is the start of the unvectorized reduction. The start value of the
+  // recipe is the start value of the epilogue loop after vectorization
+  // of the main loop (i.e. the resume value).
+  TrackingVH<Value> OrigStartValue = RdxDesc.getRecurrenceStartValue();
+  VPValue *StartVPV = PhiR->getStartValue();
+  TrackingVH<Value> ReductionStartValue = StartVPV->getLiveInIRValue();
+
+  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
   State.setDebugLocFromInst(ReductionStartValue);
@@ -3995,10 +4004,15 @@
     ReducedPartRdx = RdxDesc.isSigned()
                          ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                          : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+
+    if (!PhiR->isOrdered() && Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      ReducedPartRdx = Builder.CreateBinOp(
+          (Instruction::BinaryOps)Op, ReductionStartValue, ReducedPartRdx,
+          "rdx.start");
+    }
   }
-  PHINode *ResumePhi =
-      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+  PHINode *ResumePhi = dyn_cast<PHINode>(StartVPV->getUnderlyingValue());
   // Create a phi node that merges control-flow from the backedge-taken check
   // block and the middle block.
@@ -4015,7 +4029,7 @@
       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                               Incoming);
     else
-      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+      BCBlockPhi->addIncoming(OrigStartValue, Incoming);
   }
   // Set the resume value for this reduction
Index: llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1167,6 +1167,8 @@
   Type *VecTy = ScalarPHI ? PN->getType()
                           : VectorType::get(PN->getType(), State.VF);
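+  // A reduction that is neither in-loop nor ordered can be freely
+  // reassociated: its vector accumulator may start at the identity value,
+  // and the real start value is folded in once after the final reduction
+  // (the "rdx.start" binop emitted in the middle block).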
+  bool CanReassociate = !IsInLoop && !IsOrdered;
+
   BasicBlock *HeaderBB = State.CFG.PrevBB;
   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
          "recipe must be in the vector loop header");
@@ -1205,8 +1207,14 @@
       Iden = Builder.CreateVectorSplat(State.VF, Iden);
       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
       Builder.SetInsertPoint(VectorPH->getTerminator());
-      Constant *Zero = Builder.getInt32(0);
-      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+
+      if (CanReassociate)
+        StartV = Iden;
+      else {
+        Constant *Zero = Builder.getInt32(0);
+        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+      }
     }
   }
Index: llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -19,7 +19,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ <double -0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[TMP0]]
@@ -38,6 +38,33 @@
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP16:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP14]])
+; CHECK-NEXT:    [[RDX_START:%.*]] = fadd double 0.000000e+00, [[TMP16]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    ret double [[RES_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[RES_07:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load
double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ADD]] = fadd double [[RES_07]], [[TMP18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; ; SVE-LABEL: @test( ; SVE-NEXT: entry: @@ -57,7 +84,7 @@ ; SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; SVE: vector.body: ; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, double -0.000000e+00, i32 0), poison, zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, double -0.000000e+00, i32 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; SVE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[TMP4]] ; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 @@ -71,6 +98,33 @@ ; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SVE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SVE: middle.block: +; SVE-NEXT: [[TMP13:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, [[TMP9]]) +; SVE-NEXT: [[RDX_START:%.*]] = fadd double 0.000000e+00, [[TMP13]] +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; SVE: scalar.ph: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] +; SVE-NEXT: br label [[FOR_BODY:%.*]] +; SVE: for.cond.cleanup.loopexit: +; SVE-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] +; SVE-NEXT: br label [[FOR_COND_CLEANUP]] +; SVE: for.cond.cleanup: +; SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; SVE-NEXT: ret double [[RES_0_LCSSA]] +; SVE: for.body: +; SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SVE-NEXT: [[RES_07:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; SVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[INDVARS_IV]] +; SVE-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SVE-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP14]] to i64 +; SVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[IDXPROM1]] +; SVE-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; SVE-NEXT: [[ADD]] = fadd double [[RES_07]], [[TMP15]] +; SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; entry: %cmp6 = icmp sgt i32 %size, 0 Index: 
llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i64 5, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -50,6 +50,7 @@ ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 5, [[TMP23]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -57,43 +58,43 @@ ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]] -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4 -; CHECK-NEXT: [[TMP29]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x i64> [ zeroinitializer, [[VEC_EPILOG_PH]] ], [ [[TMP28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i64* [[TMP26]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP27]], align 4 +; CHECK-NEXT: [[TMP28]] = add <2 x i64> 
[[WIDE_LOAD9]], [[VEC_PHI8]] ; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP29]]) +; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP28]]) +; CHECK-NEXT: [[RDX_START11:%.*]] = add i64 [[BC_MERGE_RDX]], [[TMP30]] ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_START11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP32]], [[SUM]] +; CHECK-NEXT: [[TMP31:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add i64 [[TMP31]], [[SUM]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[RDX_START11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[ADD_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll +++ llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -27,13 +27,14 @@ ; GFX9: middle.block: ; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP5]], [[TMP4]] ; GFX9-NEXT: [[TMP7:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]]) +; GFX9-NEXT: [[RDX_START:%.*]] = fadd half [[TMP7]], 0xH0000 ; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; GFX9: scalar.ph: ; GFX9-NEXT: br label [[FOR_BODY:%.*]] ; GFX9: for.body: ; GFX9-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP2:![0-9]+]] ; GFX9: for.end: -; GFX9-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; GFX9-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; GFX9-NEXT: ret half [[ADD_LCSSA]] ; ; VI-LABEL: @vectorize_v2f16_loop( @@ -59,13 +60,14 @@ ; VI: middle.block: ; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP5]], [[TMP4]] ; VI-NEXT: [[TMP7:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]]) +; VI-NEXT: [[RDX_START:%.*]] = fadd half [[TMP7]], 0xH0000 ; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VI: scalar.ph: ; VI-NEXT: br label [[FOR_BODY:%.*]] ; VI: for.body: ; VI-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; VI: for.end: -; VI-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; VI-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; VI-NEXT: ret half [[ADD_LCSSA]] ; ; CI-LABEL: @vectorize_v2f16_loop( Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll @@ -350,13 +350,14 @@ ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float [[TMP8]], 0.000000e+00 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: @@ -388,7 +389,7 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* @@ -404,13 +405,14 @@ ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fmul float [[TMP8]], 0.000000e+00 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.end: -; 
CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -270,10 +270,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = mul i32 1, [[TMP7]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -285,7 +286,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -338,10 +339,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = and i32 -1, [[TMP7]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -353,7 +355,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ 
[[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -406,10 +408,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = or i32 0, [[TMP7]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -421,7 +424,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -474,10 +477,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = xor i32 0, [[TMP7]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -489,7 +493,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -542,10 +546,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float 0.000000e+00, [[TMP7]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -557,7 +562,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -610,10 +615,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fmul float 1.000000e+00, [[TMP7]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -625,7 +631,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] Index: llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll @@ -50,13 +50,14 @@ ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float [[TMP13]], 0.000000e+00 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PA]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi float* [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PB]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PA_ADDR_020:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -86,7 +87,7 @@ ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: while.end: -; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[ACCUM_0_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll +++ llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll @@ -39,40 +39,40 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double> -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP16]] = 
fsub fast <2 x double> [[VEC_PHI]], [[TMP15]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = fpext <2 x float> [[TMP7]] to <2 x double> +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <2 x float>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double> +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP13]] +; CHECK-NEXT: [[TMP15]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP14]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP15]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd double [[CONV114]], [[TMP17]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[INNERLOOP:%.*]] ; CHECK: innerloop: ; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ] @@ -91,9 +91,9 @@ ; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]] ; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1 ; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]] -; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: outerend: -; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi 
double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32 ; CHECK-NEXT: [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]] ; CHECK-NEXT: [[INC144]] = add nuw nsw i32 [[J_0136]], 1 Index: llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll @@ -23,7 +23,8 @@ ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]]) -; CHECK-NEXT: ret double [[TMP7]] +; CHECK-NEXT: [[RDX_START:%.*]] = fadd double [[TMP7]], 0.000000e+00 +; CHECK-NEXT: ret double [[RDX_START]] ; entry: br label %for.cond Index: llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll @@ -21,7 +21,8 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP5]]) -; CHECK-NEXT: ret double [[TMP7]] +; CHECK-NEXT: [[RDX_START:%.*]] = fadd double [[TMP7]], 0.000000e+00 +; CHECK-NEXT: ret double [[RDX_START]] ; entry: br label %for.cond Index: llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll @@ -95,11 +95,12 @@ ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP8]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -111,7 +112,7 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll 
=================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -54,11 +54,12 @@ ; OUTLOOP: middle.block: ; OUTLOOP-NEXT: [[BIN_RDX:%.*]] = add [[TMP21]], [[TMP20]] ; OUTLOOP-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[BIN_RDX]]) +; OUTLOOP-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP25]] ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; OUTLOOP: scalar.ph: ; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; OUTLOOP: for.body: ; OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -71,7 +72,7 @@ ; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; OUTLOOP: for.cond.cleanup.loopexit: -; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]] ; OUTLOOP: for.cond.cleanup: ; OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -446,11 +446,12 @@ ; VLENUNK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP6]]) +; VLENUNK-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP9]] ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VLENUNK: scalar.ph: ; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]] ; VLENUNK: for.body: ; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -464,7 +465,7 @@ ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLENUNK: for.end: -; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; ; VLEN128-LABEL: @indexed_load( @@ -493,11 +494,12 @@ ; VLEN128-NEXT: br i1 [[TMP8]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP6]]) +; VLEN128-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP9]] ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VLEN128: scalar.ph: ; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; VLEN128-NEXT: br label [[FOR_BODY:%.*]] ; VLEN128: for.body: ; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -511,7 +513,7 @@ ; VLEN128-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; VLEN128-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLEN128: for.end: -; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; VLEN128-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll @@ -166,10 +166,11 @@ ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP10]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP13]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -183,7 +184,7 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/cost-model.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -216,11 +216,12 @@ ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]] ; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX5]]) +; CHECK-NEXT: [[RDX_START:%.*]] 
= fadd float 0.000000e+00, [[TMP124]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ] @@ -235,7 +236,7 @@ ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]] ; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD4_LCSSA]], [[LOOPEXIT]] ] Index: llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -87,11 +87,12 @@ ; SSE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE: middle.block: ; SSE-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]]) +; SSE-NEXT: [[RDX_START:%.*]] = fadd double 0.000000e+00, [[TMP8]] ; SSE-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; SSE-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]] ; SSE: scalar.ph: ; SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SSE-NEXT: br label [[LOOP:%.*]] ; SSE: loop: ; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] @@ -111,7 +112,7 @@ ; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 ; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]] ; SSE: done: -; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] ; ; AVX-LABEL: @sumIfVector( @@ -136,11 +137,12 @@ ; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX: middle.block: ; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]]) +; AVX-NEXT: [[RDX_START:%.*]] = fadd double 0.000000e+00, [[TMP8]] ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: ; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; AVX-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; AVX-NEXT: br label [[LOOP:%.*]] ; AVX: loop: ; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] @@ -160,7 +162,7 @@ ; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 ; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX: done: -; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -69,30 +69,30 @@ ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC17:%.*]] = and i64 [[SMAX6]], 9223372036854775800 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> , i32 [[BC_MERGE_RDX]], i64 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8, !alias.scope !7 -; CHECK-NEXT: [[TMP17]] = add <8 x i32> [[VEC_PHI20]], [[WIDE_LOAD21]] +; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <8 x i32> [ zeroinitializer, [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 8, !alias.scope !7 +; CHECK-NEXT: [[TMP16]] = add <8 x i32> [[VEC_PHI20]], [[WIDE_LOAD21]] ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !10, !noalias !7 ; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC17]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP17]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP16]]) +; CHECK-NEXT: [[RDX_START23:%.*]] = add i32 
[[BC_MERGE_RDX]], [[TMP18]] ; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC17]] ; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i32 [ [[RDX_START23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX24]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] ; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8 ; CHECK-NEXT: [[T3]] = add i32 [[T0]], [[T2]] @@ -101,7 +101,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[RDX_START23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T4]] ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -86,11 +86,12 @@ ; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP26]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP27]], [[BIN_RDX19]] ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP29]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -108,7 +109,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -252,11 +253,12 @@ ; CHECK-NEXT: 
[[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -275,7 +277,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -439,11 +441,12 @@ ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]] ; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP105]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -461,7 +464,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -765,11 +768,12 @@ ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]] ; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP185]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: 
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -790,7 +794,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -942,11 +946,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP86]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -965,7 +970,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN_N]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1115,11 +1120,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1138,7 +1144,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1466,11 +1472,12 @@ ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], 
[[BIN_RDX37]] ; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP153]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1489,7 +1496,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4093 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1633,11 +1640,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1656,7 +1664,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1800,11 +1808,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] 
@@ -1823,7 +1832,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1967,11 +1976,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1990,7 +2000,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2143,11 +2153,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP86]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2166,7 +2177,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2317,11 +2328,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; 
CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2340,7 +2352,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2485,11 +2497,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2508,7 +2521,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2663,11 +2676,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP85]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2686,7 +2700,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], 
label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -73,15 +73,16 @@ ; CHECK-NEXT: [[TMP9]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float 0.000000e+00, [[TMP11]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -91,9 +92,9 @@ ; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]] ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: loop.exit.loopexit: -; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: ; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ] @@ -145,15 +146,16 @@ ; CHECK-NEXT: [[TMP9]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd 
float -0.000000e+00, [[TMP11]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -163,9 +165,9 @@ ; CHECK-NEXT: [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]] ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.exit.loopexit: -; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: ; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ] @@ -217,15 +219,16 @@ ; CHECK-NEXT: [[TMP9]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float -0.000000e+00, [[TMP11]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -235,9 +238,9 @@ ; CHECK-NEXT: [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]] ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop.exit.loopexit: -; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ 
[[SUM_INC]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: ; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ] @@ -298,7 +301,7 @@ ; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]] @@ -324,7 +327,7 @@ ; CHECK-NEXT: [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; entry: %cmp12 = icmp sgt i32 %N, 0 @@ -385,7 +388,7 @@ ; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]] @@ -411,7 +414,7 @@ ; CHECK-NEXT: [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; entry: %cmp12 = icmp sgt i32 %N, 0 Index: llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -63,14 +63,15 @@ ; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP39]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] 
], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] Index: llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -52,7 +52,7 @@ ; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; entry: br label %for.body @@ -194,10 +194,11 @@ ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP15]] ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -213,7 +214,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]] ; entry: @@ -237,12 +238,6 @@ ret i32 %sum.1 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[MD_IS_VEC:![0-9]+]]} -; CHECK-NEXT: [[MD_IS_VEC:![0-9]+]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-NEXT: [[LOOP3]] = distinct !{[[LOOP3]], [[MD_RT_UNROLL_DIS:![0-9]+]], [[MD_IS_VEC]]} -; CHECK-NEXT: [[MD_RT_UNROLL_DIS]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-NEXT: [[LOOP4]] = distinct !{[[LOOP4]], [[MD_IS_VEC]]} -; CHECK-NEXT: [[LOOP5]] = distinct !{[[LOOP5]], [[MD_RT_UNROLL_DIS]], [[MD_IS_VEC]]} attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" } Index: llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll =================================================================== --- 
llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -77,11 +77,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -92,7 +93,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -436,11 +437,12 @@ ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]] ; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP53]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -458,7 +460,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -516,11 +518,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] 
], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -531,7 +534,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -578,11 +581,12 @@ ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -593,7 +597,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -54,11 +54,12 @@ ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP19]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP21]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 10000, [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[T7:%.*]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[T7:%.*]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] ; CHECK-NEXT: [[T1:%.*]] = load i32, i32* [[T0]], align 4 ; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] @@ -75,7 +76,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] 
], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T8]] ; ; SINK-GATHER-LABEL: @predicated_sdiv_masked_load( @@ -178,11 +179,12 @@ ; SINK-GATHER-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SINK-GATHER: middle.block: ; SINK-GATHER-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP49]]) +; SINK-GATHER-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP51]] ; SINK-GATHER-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; SINK-GATHER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-GATHER: scalar.ph: ; SINK-GATHER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: br label [[FOR_BODY:%.*]] ; SINK-GATHER: for.body: ; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -203,7 +205,7 @@ ; SINK-GATHER-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 ; SINK-GATHER-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; SINK-GATHER: for.end: -; SINK-GATHER-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; SINK-GATHER-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: ret i32 [[T8]] ; entry: @@ -290,11 +292,12 @@ ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP18]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP20]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -313,7 +316,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T7]] ; ; SINK-GATHER-LABEL: @scalarize_and_sink_gather( @@ -439,11 +442,12 @@ ; SINK-GATHER-NEXT: br i1 [[TMP67]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SINK-GATHER: middle.block: ; SINK-GATHER-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP66]]) +; SINK-GATHER-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP68]] ; SINK-GATHER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; SINK-GATHER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-GATHER: scalar.ph: ; 
SINK-GATHER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: br label [[FOR_BODY:%.*]] ; SINK-GATHER: for.body: ; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -462,7 +466,7 @@ ; SINK-GATHER-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; SINK-GATHER-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] ; SINK-GATHER: for.end: -; SINK-GATHER-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; SINK-GATHER-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: ret i32 [[T7]] ; entry: Index: llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll +++ llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0 @@ -30,6 +30,7 @@ ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 5, [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -37,43 +38,43 @@ ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: 
[[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>* -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP12]] = add <4 x i64> [[WIDE_LOAD7]], [[VEC_PHI6]] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ zeroinitializer, [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <4 x i64>* +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11]] = add <4 x i64> [[WIDE_LOAD7]], [[VEC_PHI6]] ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP11]]) +; CHECK-NEXT: [[RDX_START9:%.*]] = add i64 [[BC_MERGE_RDX]], [[TMP13]] ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_START9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP15]], [[SUM]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add i64 [[TMP14]], [[SUM]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: 
[[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[RDX_START9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[ADD_LCSSA]] ; entry: @@ -221,44 +222,45 @@ ; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]]) ; CHECK-NEXT: [[TMP12:%.*]] = zext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[RDX_START:%.*]] = or i32 0, [[TMP12]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI3]], -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[TMP17]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2 -; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i32> [[VEC_PHI3]], +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP17]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP14]], [[TMP18]] ; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256 -; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16> -; CHECK-NEXT: [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32> -; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256 +; CHECK-NEXT: [[TMP21:%.*]] = trunc <4 x i32> [[TMP19]] to <4 x i16> +; CHECK-NEXT: [[TMP22]] = zext <4 x i16> [[TMP21]] to <4 x i32> +; CHECK-NEXT: br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label 
[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16> -; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = trunc <4 x i32> [[TMP22]] to <4 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[RDX_START6:%.*]] = or i32 [[BC_MERGE_RDX]], [[TMP25]] ; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i32 256, 256 ; CHECK-NEXT: br i1 [[CMP_N1]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_START6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX6]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2 @@ -268,7 +270,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[RDX_START6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[XOR_LCSSA]] to i16 ; CHECK-NEXT: ret i16 [[RET]] ; @@ -309,8 +311,8 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 @@ -323,7 +325,9 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]]) +; 
CHECK-NEXT: [[RDX_START3:%.*]] = fadd float 1.000000e+01, [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fmul float 1.500000e+01, [[TMP8]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -331,52 +335,52 @@ ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ 1.000000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX4:%.*]] = phi float [ 1.000000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_START3]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[N]], 4 -; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> , float [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0 +; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4 -; CHECK-NEXT: [[TMP15]] = fadd fast <4 x float> [[VEC_PHI9]], [[WIDE_LOAD10]] -; CHECK-NEXT: [[TMP16]] = fmul fast <4 x float> [[VEC_PHI8]], [[WIDE_LOAD10]] -; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x float> [ , [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x float> [ zeroinitializer, [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 
[[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, <4 x float>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13]] = fadd fast <4 x float> [[VEC_PHI10]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP14]] = fmul fast <4 x float> [[VEC_PHI9]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]]) -; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]]) -; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]]) +; CHECK-NEXT: [[RDX_START15:%.*]] = fadd float [[BC_MERGE_RDX4]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP14]]) +; CHECK-NEXT: [[RDX_START13:%.*]] = fmul float [[BC_MERGE_RDX]], [[TMP17]] +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX12:%.*]] = phi float [ 1.500000e+01, [[ITER_CHECK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi float [ 1.000000e+01, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi float [ 1.500000e+01, [[ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_START13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi float [ 1.000000e+01, [[ITER_CHECK]] ], [ [[RDX_START3]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_START15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[PROD:%.*]] = phi float [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[PROD:%.*]] = phi float [ [[BC_MERGE_RDX14]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX16]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] -; CHECK-NEXT: 
[[TMP20:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = fadd fast float [[SUM]], [[TMP20]] -; CHECK-NEXT: [[MUL]] = fmul fast float [[PROD]], [[TMP20]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = fadd fast float [[SUM]], [[TMP18]] +; CHECK-NEXT: [[MUL]] = fmul fast float [[PROD]], [[TMP18]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[RDX_START3]], [[MIDDLE_BLOCK]] ], [ [[RDX_START15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[RDX_START13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[DIV:%.*]] = fdiv float [[MUL_LCSSA]], [[ADD_LCSSA]] ; CHECK-NEXT: ret float [[DIV]] ; @@ -418,22 +422,22 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[START_SUM]], i32 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 [[START_SUM]], [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq 
i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -441,35 +445,35 @@ ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13]] = sub <4 x i32> [[VEC_PHI6]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11]] = sub <4 x i32> [[VEC_PHI6]], [[WIDE_LOAD7]] ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[RDX_START9:%.*]] = add i32 [[BC_MERGE_RDX]], [[TMP13]] ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_COND]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[START_SUM]], [[ITER_CHECK]] ], [ 
[[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[START_SUM]], [[ITER_CHECK]] ], [ [[RDX_START]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_START9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SUB:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SUB:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]] @@ -477,7 +481,7 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: for.cond: -; CHECK-NEXT: [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[RDX_START9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 ; CHECK-NEXT: [[OUTER_EXITCOND_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[OUTER_EXITCOND_NOT]], label [[FOR_END:%.*]], label [[ITER_CHECK]] Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -100,39 +100,39 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[PRE_LOAD]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDUCTION]], 1 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDUCTION1]], 1 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP5]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP8]] = load i32, i32* [[TMP6]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = add i32 [[TMP7]], [[VECTOR_RECUR]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = add i32 [[TMP8]], [[TMP7]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP9]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[PRE_LOAD]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nuw nsw i64 
[[TMP3]], 1 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP10]] = load i32, i32* [[TMP8]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = add i32 [[TMP9]], [[VECTOR_RECUR]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = add i32 [[TMP10]], [[TMP9]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP11]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP14]], i32* [[TMP12]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[FOR_PREHEADER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[FOR_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]] ; UNROLL-NO-VF: scalar.body: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP14:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP16:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] -; UNROLL-NO-VF-NEXT: [[TMP14]] = load i32, i32* [[ARRAYIDX32]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP16]] = load i32, i32* [[ARRAYIDX32]], align 4 ; UNROLL-NO-VF-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[ADD35:%.*]] = add i32 [[TMP14]], [[SCALAR_RECUR]] +; UNROLL-NO-VF-NEXT: [[ADD35:%.*]] = add i32 [[TMP16]], [[SCALAR_RECUR]] ; UNROLL-NO-VF-NEXT: store i32 [[ADD35]], i32* [[ARRAYIDX34]], align 4 ; UNROLL-NO-VF-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] @@ -333,35 +333,35 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], 
[ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION2]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP6]] = load i32, i32* [[TMP4]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sub nsw i32 [[TMP5]], [[VECTOR_RECUR]] -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sub nsw i32 [[TMP6]], [[TMP5]] -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 0 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], 0 -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i32 [[TMP7]], i32 0 -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 0 -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = icmp slt i32 [[VEC_PHI]], [[TMP11]] -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp slt i32 [[VEC_PHI1]], [[TMP12]] -; UNROLL-NO-VF-NEXT: [[TMP15]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]] -; UNROLL-NO-VF-NEXT: [[TMP16]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP5]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP8]] = load i32, i32* [[TMP6]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sub nsw i32 [[TMP7]], [[VECTOR_RECUR]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sub nsw i32 [[TMP8]], [[TMP7]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], 0 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP10]], 0 +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 0 +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = select i1 [[TMP12]], i32 [[TMP10]], i32 0 +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp slt i32 [[VEC_PHI]], [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp slt i32 [[VEC_PHI1]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP17]] = select i1 [[TMP15]], i32 [[VEC_PHI]], i32 [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP18]] = select i1 [[TMP16]], i32 [[VEC_PHI1]], i32 [[TMP14]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP15]], [[TMP16]] -; UNROLL-NO-VF-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP15]], i32 [[TMP16]] +; 
UNROLL-NO-VF-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]] +; UNROLL-NO-VF-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP17]], i32 [[TMP18]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_PREHEADER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ poison, [[FOR_PREHEADER]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]] @@ -372,12 +372,12 @@ ; UNROLL-NO-VF-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[MINMAX_0_COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[MINMAX_0_LCSSA]] ; UNROLL-NO-VF: scalar.body: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[MINMAX_028:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MINMAX_0_COND]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[TMP18]] = load i32, i32* [[ARRAYIDX]], align 4 -; UNROLL-NO-VF-NEXT: [[SUB3:%.*]] = sub nsw i32 [[TMP18]], [[SCALAR_RECUR]] +; UNROLL-NO-VF-NEXT: [[TMP20]] = load i32, i32* [[ARRAYIDX]], align 4 +; UNROLL-NO-VF-NEXT: [[SUB3:%.*]] = sub nsw i32 [[TMP20]], [[SCALAR_RECUR]] ; UNROLL-NO-VF-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[SUB3]], 0 ; UNROLL-NO-VF-NEXT: [[COND:%.*]] = select i1 [[CMP4]], i32 [[SUB3]], i32 0 ; UNROLL-NO-VF-NEXT: [[CMP5:%.*]] = icmp slt i32 [[MINMAX_028]], [[COND]] @@ -614,42 +614,42 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP7]] = load i16, i16* [[TMP5]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sitofp i16 [[TMP6]] to double -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sitofp i16 [[TMP7]] to double -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sitofp i16 [[TMP6]] to double -; UNROLL-NO-VF-NEXT: 
[[TMP12:%.*]] = fmul fast double [[TMP10]], [[CONV1]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = fmul fast double [[TMP11]], [[CONV1]] -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = fsub fast double [[TMP8]], [[TMP12]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = fsub fast double [[TMP9]], [[TMP13]] -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: store double [[TMP14]], double* [[TMP16]], align 8 -; UNROLL-NO-VF-NEXT: store double [[TMP15]], double* [[TMP17]], align 8 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = load i16, i16* [[TMP6]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP9]] = load i16, i16* [[TMP7]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sitofp i16 [[TMP8]] to double +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sitofp i16 [[TMP9]] to double +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sitofp i16 [[TMP8]] to double +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = fmul fast double [[TMP12]], [[CONV1]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = fmul fast double [[TMP13]], [[CONV1]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = fsub fast double [[TMP10]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = fsub fast double [[TMP11]], [[TMP15]] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP5]] +; UNROLL-NO-VF-NEXT: store double [[TMP16]], double* [[TMP18]], align 8 +; UNROLL-NO-VF-NEXT: store double [[TMP17]], double* [[TMP19]], align 8 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP0]], [[FOR_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP0]], [[FOR_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]] ; UNROLL-NO-VF: scalar.body: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP21:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[ADVARS_IV:%.*]] = phi i64 [ [[ADVARS_IV_NEXT:%.*]], [[SCALAR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-VF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr 
inbounds i16, i16* [[A]], i64 [[ADVARS_IV]] -; UNROLL-NO-VF-NEXT: [[TMP19]] = load i16, i16* [[ARRAYIDX5]], align 2 -; UNROLL-NO-VF-NEXT: [[CONV6:%.*]] = sitofp i16 [[TMP19]] to double +; UNROLL-NO-VF-NEXT: [[TMP21]] = load i16, i16* [[ARRAYIDX5]], align 2 +; UNROLL-NO-VF-NEXT: [[CONV6:%.*]] = sitofp i16 [[TMP21]] to double ; UNROLL-NO-VF-NEXT: [[CONV11:%.*]] = sitofp i16 [[SCALAR_RECUR]] to double ; UNROLL-NO-VF-NEXT: [[MUL12:%.*]] = fmul fast double [[CONV11]], [[CONV1]] ; UNROLL-NO-VF-NEXT: [[SUB13:%.*]] = fsub fast double [[CONV6]], [[MUL12]] @@ -994,18 +994,18 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[INDUCTION1:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[INDUCTION1]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: @@ -1018,7 +1018,7 @@ ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 ; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[INDUCTION]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1197,24 +1197,24 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[PRE_LOAD:%.*]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 
[[PRE_LOAD:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDUCTION]], 2 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDUCTION1]], 2 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP5]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP8]] = load i32, i32* [[TMP6]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP3]], 2 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP10]] = load i32, i32* [[TMP8]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]] ; UNROLL-NO-VF: scalar.body: @@ -1467,19 +1467,19 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[INDUCTION]], [[X:%.*]] -; UNROLL-NO-VF-NEXT: [[TMP1]] = add i32 [[INDUCTION1]], [[X]] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], [[X:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP3]] = add i32 [[TMP1]], [[X]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP13:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 +; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, 96 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.body: @@ -1491,7 +1491,7 @@ ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95 ; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL-NO-VF: for.end: -; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[VAL_PHI_LCSSA]] ; ; SINK-AFTER-LABEL: @extract_second_last_iteration( @@ -1644,6 +1644,7 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP51]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP39]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP39]], i32 2 @@ -1652,10 +1653,10 @@ ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.cond.cleanup: -; UNROLL-NO-IC-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[A_1_LCSSA]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] @@ -1781,6 +1782,7 @@ ; SINK-AFTER-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) +; SINK-AFTER-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP26]] ; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP19]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> 
[[TMP19]], i32 2 @@ -1789,10 +1791,10 @@ ; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] ; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: br label [[FOR_BODY:%.*]] ; SINK-AFTER: for.cond.cleanup: -; SINK-AFTER-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: ret i32 [[A_1_LCSSA]] ; SINK-AFTER: for.body: ; SINK-AFTER-NEXT: [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] @@ -1923,43 +1925,43 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDUCTION]], 1 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDUCTION1]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP0]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP5]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP8]], [[TMP6]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP9]], [[TMP7]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP7]] = load i16, i16* [[TMP5]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP6]] to i32 +; UNROLL-NO-VF-NEXT: 
[[TMP10:%.*]] = sext i16 [[TMP6]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sext i16 [[TMP7]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.body: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDVARS_IV_NEXT]] -; UNROLL-NO-VF-NEXT: [[TMP15]] = load i16, i16* [[ARRAYIDX2]], align 2 -; UNROLL-NO-VF-NEXT: [[CONV3:%.*]] = sext i16 [[TMP15]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP17]] = load i16, i16* [[ARRAYIDX2]], align 2 +; UNROLL-NO-VF-NEXT: [[CONV3:%.*]] = sext i16 [[TMP17]] to i32 ; UNROLL-NO-VF-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]] ; UNROLL-NO-VF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] ; UNROLL-NO-VF-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4 @@ -2174,46 +2176,46 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION]], i64 1 -; 
UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION1]], i64 1 -; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP0]], align 4 -; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP1]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP5]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP8]], [[TMP6]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP9]], [[TMP7]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP0]], i64 1 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP1]], i64 1 +; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP2]], align 4 +; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP3]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP7]] = load i16, i16* [[TMP5]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP6]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sext i16 [[TMP6]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sext i16 [[TMP7]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], 
[[ENTRY:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.body: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[ARRAYCIDX:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] ; UNROLL-NO-VF-NEXT: [[CUR_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDVARS_IV]], i64 1 ; UNROLL-NO-VF-NEXT: store i32 7, i32* [[ARRAYCIDX]], align 4 ; UNROLL-NO-VF-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP15]] = load i16, i16* [[CUR_INDEX]], align 2 -; UNROLL-NO-VF-NEXT: [[CONV3:%.*]] = sext i16 [[TMP15]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP17]] = load i16, i16* [[CUR_INDEX]], align 2 +; UNROLL-NO-VF-NEXT: [[CONV3:%.*]] = sext i16 [[TMP17]] to i32 ; UNROLL-NO-VF-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]] ; UNROLL-NO-VF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] ; UNROLL-NO-VF-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4 @@ -2412,46 +2414,46 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDUCTION]], 1 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDUCTION1]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP0]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP6]], 2 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP7]], 2 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sext i16 [[TMP4]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sext i16 [[TMP5]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = mul nsw i32 [[TMP8]], [[TMP10]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP9]], [[TMP11]] -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4 +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; UNROLL-NO-VF-NEXT: 
[[TMP3:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP7]] = load i16, i16* [[TMP5]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP6]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP8]], 2 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP9]], 2 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sext i16 [[TMP6]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sext i16 [[TMP7]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = mul nsw i32 [[TMP11]], [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP14]], i32* [[TMP16]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP15]], i32* [[TMP17]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.body: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 ; UNROLL-NO-VF-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 2 ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDVARS_IV_NEXT]] -; UNROLL-NO-VF-NEXT: [[TMP17]] = load i16, i16* [[ARRAYIDX2]], align 2 -; UNROLL-NO-VF-NEXT: [[CONV3:%.*]] = sext i16 [[TMP17]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP19]] = load i16, i16* [[ARRAYIDX2]], align 2 +; UNROLL-NO-VF-NEXT: [[CONV3:%.*]] = sext i16 [[TMP19]] to i32 ; UNROLL-NO-VF-NEXT: [[MUL:%.*]] = mul nsw i32 [[ADD]], [[CONV3]] ; UNROLL-NO-VF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] ; UNROLL-NO-VF-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4 @@ -2701,35 +2703,35 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; 
UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR1:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR1:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16 ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[TMP0]] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i16 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i16 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[INDUCTION]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[INDUCTION2]], 1 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP4]] = zext i16 [[TMP2]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add i16 [[TMP1]], 5 -; UNROLL-NO-VF-NEXT: [[TMP6]] = add i16 [[TMP2]], 5 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[TMP1]], 1 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i16 [[TMP2]], 1 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = zext i16 [[TMP3]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP6]] = zext i16 [[TMP4]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add i16 [[TMP3]], 5 +; UNROLL-NO-VF-NEXT: [[TMP8]] = add i16 [[TMP4]], 5 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42 -; UNROLL-NO-VF-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42 +; UNROLL-NO-VF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 43, 42 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ -27, [[ENTRY:%.*]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ -27, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 15, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_COND:%.*]] ; UNROLL-NO-VF: for.cond: ; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ] ; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR4:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ] ; UNROLL-NO-VF-NEXT: [[USE_REC_1:%.*]] = sub i16 [[SCALAR_RECUR]], 10 -; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 
[[SCALAR_RECUR4]], 15 +; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[SCALAR_RECUR3]], 15 ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32 ; UNROLL-NO-VF-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 @@ -2911,16 +2913,17 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP51]] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP43]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP43]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-IC: bb1: -; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[VAR]] ; UNROLL-NO-IC: bb2: ; UNROLL-NO-IC-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -2946,41 +2949,41 @@ ; UNROLL-NO-VF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE5:%.*]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_UDIV_CONTINUE5]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[PRED_UDIV_CONTINUE5]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE5]] ] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE4]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_UDIV_CONTINUE4]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[PRED_UDIV_CONTINUE4]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]] ; UNROLL-NO-VF-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[VEC_IV3:%.*]] = add i32 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[VEC_IV2:%.*]] = add i32 [[INDEX]], 1 ; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV3]], [[TRIP_COUNT_MINUS_1]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV2]], [[TRIP_COUNT_MINUS_1]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: 
[[TMP4:%.*]] = udiv i32 219220132, [[INDUCTION]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = udiv i32 219220132, [[TMP4]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] ; UNROLL-NO-VF: pred.udiv.if3: -; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i32 [[OFFSET_IDX]], -1 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION2]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE5]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; UNROLL-NO-VF: pred.udiv.continue4: -; UNROLL-NO-VF-NEXT: [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF4]] ] -; UNROLL-NO-VF-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] -; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI1]], [[TMP5]] -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI1]] +; UNROLL-NO-VF-NEXT: [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ] +; UNROLL-NO-VF-NEXT: [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] +; UNROLL-NO-VF-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]] +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] ; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]] @@ -3065,16 +3068,17 @@ ; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) +; SINK-AFTER-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP27]] ; 
SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2 ; SINK-AFTER-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] -; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: br label [[BB2:%.*]] ; SINK-AFTER: bb1: -; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: ret i32 [[VAR]] ; SINK-AFTER: bb2: ; SINK-AFTER-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3281,6 +3285,7 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]] ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP75]] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP43]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP43]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] @@ -3288,10 +3293,10 @@ ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP75]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-IC: bb1: -; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP75]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[VAR]] ; UNROLL-NO-IC: bb2: ; UNROLL-NO-IC-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3321,55 +3326,55 @@ ; UNROLL-NO-VF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_STORE_CONTINUE10]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[PRED_STORE_CONTINUE10]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE10]] ] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], 
[[PRED_STORE_CONTINUE7]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_STORE_CONTINUE7]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[PRED_STORE_CONTINUE7]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[INDUCTION4:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION5:%.*]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[VEC_IV6:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV6]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[VEC_IV3:%.*]] = add i32 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = icmp ule i32 [[VEC_IV3]], [[TRIP_COUNT_MINUS_1]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = udiv i32 219220132, [[INDUCTION4]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[TMP2]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP5]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]] ; UNROLL-NO-VF: pred.udiv.if4: -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION5]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 219220132, [[TMP3]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE5]] ; UNROLL-NO-VF: pred.udiv.continue5: -; UNROLL-NO-VF-NEXT: [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] -; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI2]], [[TMP5]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF4]] ] +; UNROLL-NO-VF-NEXT: [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] +; UNROLL-NO-VF-NEXT: [[TMP11]] = add i32 [[VEC_PHI2]], [[TMP7]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.store.if: -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION4]], i32* [[TMP10]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP12]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP2]], i32* [[TMP13]], align 4 ; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NO-VF: 
pred.store.continue: -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] ; UNROLL-NO-VF: pred.store.if6: -; UNROLL-NO-VF-NEXT: [[INDUCTION3:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[INDUCTION3]] -; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION5]], i32* [[TMP11]], align 4 -; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE10]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP14]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP3]], i32* [[TMP15]], align 4 +; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE7]] ; UNROLL-NO-VF: pred.store.continue7: -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI2]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = select i1 [[TMP4]], i32 [[TMP10]], i32 [[VEC_PHI]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI2]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP29:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP29:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] ; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] @@ -3490,6 +3495,7 @@ ; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP37]]) +; SINK-AFTER-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP39]] ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2 ; SINK-AFTER-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] @@ -3497,10 +3503,10 @@ ; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] ; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; 
SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: br label [[BB2:%.*]] ; SINK-AFTER: bb1: -; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: ret i32 [[VAR]] ; SINK-AFTER: bb2: ; SINK-AFTER-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3607,28 +3613,28 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i16 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i16 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i16 [[INDUCTION]], 1 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[INDUCTION1]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = or i16 [[TMP0]], [[TMP0]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = or i16 [[TMP1]], [[TMP1]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP5]] = zext i16 [[TMP3]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[INDUCTION1]] -; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP6]], align 4 -; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP7]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[TMP0]], 1 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[TMP1]], 1 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP7]] = zext i16 [[TMP5]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[TMP1]] +; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP8]], align 4 +; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP9]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 +; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] 
] ; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-VF: loop: Index: llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -226,80 +226,80 @@ ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE33:%.*]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION31:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !15 -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !15 -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = add nsw i32 [[TMP8]], 23 -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP9]], 23 -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP10]], 24 -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP11]], 24 -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP12]], 25 -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = add nsw i32 [[TMP13]], 25 -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = add nsw i32 [[TMP14]], 26 -; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP15]], 26 -; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = icmp slt i32 [[TMP8]], 100 -; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = icmp slt i32 [[TMP9]], 100 -; UNROLL-NO-VF-NEXT: br i1 [[TMP24]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE32:%.*]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] 
= getelementptr inbounds i32, i32* [[AUD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP10]], 23 +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP11]], 23 +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP12]], 24 +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = add nsw i32 [[TMP13]], 24 +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = add nsw i32 [[TMP14]], 25 +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP15]], 25 +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = add nsw i32 [[TMP16]], 26 +; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = add nsw i32 [[TMP17]], 26 +; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = icmp slt i32 [[TMP10]], 100 +; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = icmp slt i32 [[TMP11]], 100 +; UNROLL-NO-VF-NEXT: br i1 [[TMP26]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.urem.if: -; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = sdiv i32 [[TMP16]], [[TMP8]] -; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP18]], [[TMP10]] -; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = srem i32 [[TMP20]], [[TMP12]] -; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = urem i32 [[TMP22]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = sdiv i32 [[TMP18]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP20]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = srem i32 [[TMP22]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = urem i32 [[TMP24]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE]] ; UNROLL-NO-VF: pred.urem.continue: -; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP27]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP28]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP29]], [[PRED_UREM_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP25]], label [[PRED_UREM_IF32:%.*]], label [[PRED_UREM_CONTINUE33]] +; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP35:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP31]], [[PRED_UREM_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP27]], label [[PRED_UREM_IF31:%.*]], label [[PRED_UREM_CONTINUE32]] ; UNROLL-NO-VF: pred.urem.if31: -; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = sdiv i32 [[TMP17]], 
[[TMP9]] -; UNROLL-NO-VF-NEXT: [[TMP35:%.*]] = udiv i32 [[TMP19]], [[TMP11]] -; UNROLL-NO-VF-NEXT: [[TMP36:%.*]] = srem i32 [[TMP21]], [[TMP13]] -; UNROLL-NO-VF-NEXT: [[TMP37:%.*]] = urem i32 [[TMP23]], [[TMP15]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE33]] +; UNROLL-NO-VF-NEXT: [[TMP36:%.*]] = sdiv i32 [[TMP19]], [[TMP11]] +; UNROLL-NO-VF-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP21]], [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP38:%.*]] = srem i32 [[TMP23]], [[TMP15]] +; UNROLL-NO-VF-NEXT: [[TMP39:%.*]] = urem i32 [[TMP25]], [[TMP17]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE32]] ; UNROLL-NO-VF: pred.urem.continue32: -; UNROLL-NO-VF-NEXT: [[TMP38:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP34]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP39:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP35]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP42:%.*]] = xor i1 [[TMP24]], true -; UNROLL-NO-VF-NEXT: [[TMP43:%.*]] = xor i1 [[TMP25]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP42]], i32 [[TMP16]], i32 [[TMP30]] -; UNROLL-NO-VF-NEXT: [[PREDPHI34:%.*]] = select i1 [[TMP43]], i32 [[TMP17]], i32 [[TMP38]] -; UNROLL-NO-VF-NEXT: [[PREDPHI35:%.*]] = select i1 [[TMP42]], i32 [[TMP18]], i32 [[TMP31]] -; UNROLL-NO-VF-NEXT: [[PREDPHI36:%.*]] = select i1 [[TMP43]], i32 [[TMP19]], i32 [[TMP39]] -; UNROLL-NO-VF-NEXT: [[PREDPHI37:%.*]] = select i1 [[TMP42]], i32 [[TMP20]], i32 [[TMP32]] -; UNROLL-NO-VF-NEXT: [[PREDPHI38:%.*]] = select i1 [[TMP43]], i32 [[TMP21]], i32 [[TMP40]] -; UNROLL-NO-VF-NEXT: [[PREDPHI39:%.*]] = select i1 [[TMP42]], i32 [[TMP22]], i32 [[TMP33]] -; UNROLL-NO-VF-NEXT: [[PREDPHI40:%.*]] = select i1 [[TMP43]], i32 [[TMP23]], i32 [[TMP41]] -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI34]], i32* [[TMP1]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI35]], i32* [[TMP2]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI36]], i32* [[TMP3]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI37]], i32* [[TMP4]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI38]], i32* [[TMP5]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI39]], i32* [[TMP6]], align 4, !alias.scope !15 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI40]], i32* [[TMP7]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF31]] ] +; UNROLL-NO-VF-NEXT: [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF31]] ] +; UNROLL-NO-VF-NEXT: [[TMP42:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP38]], [[PRED_UREM_IF31]] ] +; UNROLL-NO-VF-NEXT: [[TMP43:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF31]] ] +; UNROLL-NO-VF-NEXT: [[TMP44:%.*]] = xor i1 [[TMP26]], true +; UNROLL-NO-VF-NEXT: [[TMP45:%.*]] = xor i1 [[TMP27]], true +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP44]], i32 [[TMP18]], i32 [[TMP32]] +; UNROLL-NO-VF-NEXT: [[PREDPHI33:%.*]] = select i1 [[TMP45]], i32 [[TMP19]], i32 [[TMP40]] +; UNROLL-NO-VF-NEXT: [[PREDPHI34:%.*]] = select 
i1 [[TMP44]], i32 [[TMP20]], i32 [[TMP33]] +; UNROLL-NO-VF-NEXT: [[PREDPHI35:%.*]] = select i1 [[TMP45]], i32 [[TMP21]], i32 [[TMP41]] +; UNROLL-NO-VF-NEXT: [[PREDPHI36:%.*]] = select i1 [[TMP44]], i32 [[TMP22]], i32 [[TMP34]] +; UNROLL-NO-VF-NEXT: [[PREDPHI37:%.*]] = select i1 [[TMP45]], i32 [[TMP23]], i32 [[TMP42]] +; UNROLL-NO-VF-NEXT: [[PREDPHI38:%.*]] = select i1 [[TMP44]], i32 [[TMP24]], i32 [[TMP35]] +; UNROLL-NO-VF-NEXT: [[PREDPHI39:%.*]] = select i1 [[TMP45]], i32 [[TMP25]], i32 [[TMP43]] +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP2]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI33]], i32* [[TMP3]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI34]], i32* [[TMP4]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI35]], i32* [[TMP5]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI36]], i32* [[TMP6]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI37]], i32* [[TMP7]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI38]], i32* [[TMP8]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI39]], i32* [[TMP9]], align 4, !alias.scope !15 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NO-VF-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NO-VF-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -493,46 +493,46 @@ ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !19, !noalias !22 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !19, !noalias !22 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], 23 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP3]], 23 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP2]], 100 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp slt i32 [[TMP3]], 100 -; UNROLL-NO-VF-NEXT: br i1 [[TMP6]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE7:%.*]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !19, !noalias !22 
+; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], 23 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP5]], 23 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP4]], 100 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP5]], 100 +; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.sdiv.if: -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !alias.scope !22 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sdiv i32 [[TMP4]], [[TMP2]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP9]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !alias.scope !22 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP6]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sdiv i32 [[TMP11]], [[TMP12]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE]] ; UNROLL-NO-VF: pred.sdiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP9]], label [[PRED_SDIV_IF6:%.*]], label [[PRED_SDIV_CONTINUE7]] ; UNROLL-NO-VF: pred.sdiv.if6: -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !alias.scope !22 -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP5]], [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP15]], [[TMP16]] -; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE8]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !alias.scope !22 +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP7]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]] +; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE7]] ; UNROLL-NO-VF: pred.sdiv.continue7: -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = xor i1 [[TMP6]], true -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = xor i1 [[TMP7]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP20]], i32 [[TMP4]], i32 [[TMP13]] -; UNROLL-NO-VF-NEXT: [[PREDPHI9:%.*]] = select i1 [[TMP21]], i32 [[TMP5]], i32 [[TMP19]] -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !19, !noalias !22 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI9]], i32* [[TMP1]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_SDIV_IF6]] ] +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = phi i32 [ poison, 
[[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF6]] ] +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP8]], true +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = xor i1 [[TMP9]], true +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP22]], i32 [[TMP6]], i32 [[TMP15]] +; UNROLL-NO-VF-NEXT: [[PREDPHI8:%.*]] = select i1 [[TMP23]], i32 [[TMP7]], i32 [[TMP21]] +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP2]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI8]], i32* [[TMP3]], align 4, !alias.scope !19, !noalias !22 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NO-VF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NO-VF-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -703,56 +703,56 @@ ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !28, !noalias !31 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !28, !noalias !31 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], 23 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP3]], 23 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP2]], 100 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp slt i32 [[TMP3]], 100 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp sge i32 [[TMP2]], 200 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp sge i32 [[TMP3]], 200 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = xor i1 [[TMP6]], true, !dbg [[DBG33:![0-9]+]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = xor i1 [[TMP7]], true, !dbg [[DBG33]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP10]], i1 [[TMP8]], i1 false, !dbg [[DBG34:![0-9]+]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP11]], i1 [[TMP9]], i1 false, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP6]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP7]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE7:%.*]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4, !alias.scope !28, !noalias !31 +; 
UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], 23 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP5]], 23 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP4]], 100 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP5]], 100 +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp sge i32 [[TMP4]], 200 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp sge i32 [[TMP5]], 200 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = xor i1 [[TMP8]], true, !dbg [[DBG33:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = xor i1 [[TMP9]], true, !dbg [[DBG33]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = select i1 [[TMP12]], i1 [[TMP10]], i1 false, !dbg [[DBG34:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i1 [[TMP11]], i1 false, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP8]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP9]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.sdiv.if: -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !alias.scope !31 -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP4]], [[TMP2]] -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !alias.scope !31 +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = sdiv i32 [[TMP6]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE]] ; UNROLL-NO-VF: pred.sdiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP18]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP15]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]] +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP21]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[PRED_SDIV_IF6:%.*]], label [[PRED_SDIV_CONTINUE7]] ; UNROLL-NO-VF: pred.sdiv.if6: -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !alias.scope !31 -; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = sdiv i32 [[TMP5]], [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = sdiv i32 [[TMP23]], [[TMP24]] -; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE8]] +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4, !alias.scope !31 +; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = sdiv i32 [[TMP7]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP25]], [[TMP26]] +; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE7]] ; UNROLL-NO-VF: pred.sdiv.continue7: -; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP24]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = xor i1 [[TMP8]], true, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = xor i1 [[TMP9]], true, !dbg [[DBG34]] 
-; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = select i1 [[TMP10]], i1 [[TMP28]], i1 false, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = select i1 [[TMP11]], i1 [[TMP29]], i1 false, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP30]], i32 [[TMP4]], i32 [[TMP21]] -; UNROLL-NO-VF-NEXT: [[PREDPHI9:%.*]] = select i1 [[TMP31]], i32 [[TMP5]], i32 [[TMP27]] -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !28, !noalias !31 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI9]], i32* [[TMP1]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF6]] ] +; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP27]], [[PRED_SDIV_IF6]] ] +; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = xor i1 [[TMP10]], true, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = xor i1 [[TMP11]], true, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = select i1 [[TMP12]], i1 [[TMP30]], i1 false, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = select i1 [[TMP13]], i1 [[TMP31]], i1 false, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP32]], i32 [[TMP6]], i32 [[TMP23]] +; UNROLL-NO-VF-NEXT: [[PREDPHI8:%.*]] = select i1 [[TMP33]], i32 [[TMP7]], i32 [[TMP29]] +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP2]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI8]], i32* [[TMP3]], align 4, !alias.scope !28, !noalias !31 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NO-VF-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NO-VF-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -867,11 +867,12 @@ ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP19]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP21]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -890,7 +891,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP38:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T7]] ; ; UNROLL-NO-VF-LABEL: 
@predicated_udiv_scalarized_operand( @@ -903,40 +904,40 @@ ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION2]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE3:%.*]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UDIV_CONTINUE3]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UDIV_CONTINUE3]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4 ; UNROLL-NO-VF-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], [[X:%.*]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP2]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP4]], [[TMP6]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.udiv.if2: -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP3]], [[X]] -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP3]], [[TMP7]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE4]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], [[X]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP5]], [[TMP9]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.udiv.continue3: -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ] -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = xor i1 [[C]], true -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = xor i1 [[C]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP6]], i32 [[TMP2]] -; UNROLL-NO-VF-NEXT: [[PREDPHI5:%.*]] = select i1 [[C]], i32 [[TMP9]], i32 [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP12]] = 
add i32 [[VEC_PHI]], [[PREDPHI]] -; UNROLL-NO-VF-NEXT: [[TMP13]] = add i32 [[VEC_PHI1]], [[PREDPHI5]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = xor i1 [[C]], true +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = xor i1 [[C]], true +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP8]], i32 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[C]], i32 [[TMP11]], i32 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP14]] = add i32 [[VEC_PHI]], [[PREDPHI]] +; UNROLL-NO-VF-NEXT: [[TMP15]] = add i32 [[VEC_PHI1]], [[PREDPHI4]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP14]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: Index: llvm/test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -11,41 +11,41 @@ ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] -; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]] -; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDUCTION1]] -; UNROLL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 -; UNROLL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; UNROLL-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 100 -; UNROLL-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 100 -; UNROLL-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[TMP0]] +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP1]] +; UNROLL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; UNROLL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4 +; UNROLL-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 100 +; UNROLL-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 100 +; UNROLL-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP2]], 20 -; UNROLL-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4 +; UNROLL-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP4]], 20 +; UNROLL-NEXT: store i32 [[TMP8]], i32* [[TMP2]], align 4 ; UNROLL-NEXT: br label 
[[PRED_STORE_CONTINUE]] ; UNROLL: pred.store.continue: -; UNROLL-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; UNROLL-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; UNROLL: pred.store.if1: -; UNROLL-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP3]], 20 -; UNROLL-NEXT: store i32 [[TMP7]], i32* [[TMP1]], align 4 -; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] +; UNROLL-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], 20 +; UNROLL-NEXT: store i32 [[TMP9]], i32* [[TMP3]], align 4 +; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE2]] ; UNROLL: pred.store.continue2: ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 128, [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] -; UNROLL-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; UNROLL-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 100 +; UNROLL-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; UNROLL-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100 ; UNROLL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL: if.then: -; UNROLL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 20 +; UNROLL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], 20 ; UNROLL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 ; UNROLL-NEXT: br label [[FOR_INC]] ; UNROLL: for.inc: @@ -61,30 +61,30 @@ ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDUCTION1]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 100 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 100 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[TMP0]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: 
[[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 100 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 100 +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP2]], 20 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP4]], 20 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP8]], i32* [[TMP2]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; UNROLL-NOSIMPLIFY: pred.store.if1: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP3]], 20 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP7]], i32* [[TMP1]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], 20 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP9]], i32* [[TMP3]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE2]] ; UNROLL-NOSIMPLIFY: pred.store.continue2: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -94,11 +94,11 @@ ; UNROLL-NOSIMPLIFY: for.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 100 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: if.then: -; UNROLL-NOSIMPLIFY-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 20 +; UNROLL-NOSIMPLIFY-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], 20 ; UNROLL-NOSIMPLIFY-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: for.inc: @@ -210,36 +210,36 @@ ; UNROLL-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ] -; UNROLL-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ] +; UNROLL-NEXT: [[INDEX:%.*]] = phi 
i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE3]] ] +; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_STORE_CONTINUE3]] ] ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] -; UNROLL-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]] +; UNROLL-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 -; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]] -; UNROLL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -; UNROLL-NEXT: store i32 [[TMP6]], i32* [[TMP5]], align 4 -; UNROLL-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1 -; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]] -; UNROLL-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -; UNROLL-NEXT: store i32 [[TMP8]], i32* [[TMP7]], align 4 -; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE4]] +; UNROLL-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[TMP5]] +; UNROLL-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; UNROLL-NEXT: store i32 [[TMP7]], i32* [[TMP6]], align 4 +; UNROLL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1 +; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP8]] +; UNROLL-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 +; UNROLL-NEXT: store i32 [[TMP10]], i32* [[TMP9]], align 4 +; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.continue3: -; UNROLL-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI]], 1 -; UNROLL-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI2]], 1 -; UNROLL-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP11]], i32 [[VEC_PHI]], i32 [[TMP9]] -; UNROLL-NEXT: [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI2]], i32 [[TMP10]] +; UNROLL-NEXT: [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1 +; UNROLL-NEXT: [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1 +; UNROLL-NEXT: [[TMP13:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NEXT: [[TMP14:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]] +; UNROLL-NEXT: [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]] +; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI4]], [[PREDPHI]] ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] -; UNROLL-NEXT: [[TMP14:%.*]] = xor i1 [[CMP_N]], true -; UNROLL-NEXT: call void @llvm.assume(i1 [[TMP14]]) +; UNROLL-NEXT: [[TMP16:%.*]] = xor i1 [[CMP_N]], true +; UNROLL-NEXT: call void @llvm.assume(i1 [[TMP16]]) ; 
UNROLL-NEXT: br label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[V_1]], [[ENTRY:%.*]] ] @@ -282,37 +282,37 @@ ; UNROLL-NOSIMPLIFY-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE3]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_STORE_CONTINUE3]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[TMP4]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP6]], i32* [[TMP5]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.if2: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP7]], i32* [[TMP6]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP7]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = add i32 [[VEC_PHI]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI2]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP10]], i32 [[VEC_PHI]], i32 
[[TMP8]] -; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI2]], i32 [[TMP9]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NOSIMPLIFY-NEXT: [[TMP13:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]] +; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]] ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: -; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]] +; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI4]], [[PREDPHI]] ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_INC26_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NOSIMPLIFY: scalar.ph: @@ -357,51 +357,51 @@ ; VEC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2 ; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] ; VEC-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] -; VEC-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0 ; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i32 0 ; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; VEC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE2]] ] +; VEC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE2]] ] ; VEC-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] -; VEC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[TMP6]] -; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 -; VEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* -; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4 -; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[TMP5]] +; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; VEC-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>* +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4 +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: 
pred.store.if: -; VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP6]] -; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; VEC-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 4 +; VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP5]] +; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: store i32 [[TMP11]], i32* [[TMP10]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; VEC-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if1: -; VEC-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VEC-NEXT: [[TMP15:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP14]] -; VEC-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; VEC-NEXT: store i32 [[TMP16]], i32* [[TMP15]], align 4 +; VEC-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1 +; VEC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP13]] +; VEC-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: -; VEC-NEXT: [[TMP17:%.*]] = add <2 x i32> [[VEC_PHI]], -; VEC-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP18]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP17]] +; VEC-NEXT: [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], +; VEC-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VEC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC: middle.block: -; VEC-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) +; VEC-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) +; VEC-NEXT: [[RDX_START:%.*]] = add i32 [[V_2:%.*]], [[TMP19]] ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] -; VEC-NEXT: [[TMP21:%.*]] = xor i1 [[CMP_N]], true -; VEC-NEXT: call void @llvm.assume(i1 [[TMP21]]) +; VEC-NEXT: [[TMP20:%.*]] = xor i1 [[CMP_N]], true +; VEC-NEXT: call void @llvm.assume(i1 [[TMP20]]) ; VEC-NEXT: br label [[SCALAR_PH]] ; VEC: scalar.ph: ; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[V_1]], [[ENTRY:%.*]] ] -; VEC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[V_2]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[V_2]], [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; VEC-NEXT: br label [[FOR_BODY14:%.*]] ; VEC: for.body14: ; VEC-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -460,26 +460,26 @@ ; 
UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]] +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]] -; UNROLL-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; UNROLL-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; UNROLL-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION4]] -; UNROLL-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; UNROLL-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1 -; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE6]] +; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] +; UNROLL-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP1]], align 1 +; UNROLL-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; UNROLL-NEXT: store i8 [[TMP4]], i8* [[TMP1]], align 1 +; UNROLL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* undef, i64 [[TMP5]] +; UNROLL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 +; UNROLL-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; UNROLL-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; UNROLL-NEXT: store i8 [[TMP9]], i8* [[TMP6]], align 1 +; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.continue3: ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] @@ -508,30 +508,30 @@ ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = 
trunc i32 [[TMP2]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP1]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP4]], i8* [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.if2: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* undef, i64 [[TMP5]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP9]], i8* [[TMP6]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -566,34 +566,34 @@ ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0 -; VEC-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>* -; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1 -; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; VEC-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] +; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP1]], i32 0 +; VEC-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>* +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 
[[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: pred.store.if: -; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 -; VEC-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 -; VEC-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 -; VEC-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP9]], align 1 +; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; VEC-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 +; VEC-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] +; VEC-NEXT: store i8 [[TMP7]], i8* [[TMP8]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; VEC: pred.store.if2: -; VEC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 -; VEC-NEXT: [[TMP13:%.*]] = zext i8 [[TMP12]] to i32 -; VEC-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8 -; VEC-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* undef, i64 [[TMP11]] -; VEC-NEXT: store i8 [[TMP14]], i8* [[TMP15]], align 1 +; VEC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 +; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: [[TMP12:%.*]] = zext i8 [[TMP11]] to i32 +; VEC-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; VEC-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP10]] +; VEC-NEXT: store i8 [[TMP13]], i8* [[TMP14]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]] ; VEC: pred.store.continue3: ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; VEC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] @@ -672,32 +672,32 @@ ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[INDUCTION2]] -; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP1]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[TMP0]] +; 
UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP4]], i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, i8* [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP6]], i8* [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.if2: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP3]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP9]], i8* [[TMP3]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] Index: llvm/test/Transforms/LoopVectorize/induction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/induction.ll +++ llvm/test/Transforms/LoopVectorize/induction.ll @@ -680,11 +680,12 @@ ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP4]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], 
[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -696,7 +697,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[TMP10]] ; ; IND-LABEL: @scalarize_induction_variable_01( @@ -814,11 +815,12 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP11]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -830,7 +832,7 @@ ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; UNROLL-NO-IC: for.end: -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP14]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP14]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i64 [[TMP15]] ; ; INTERLEAVE-LABEL: @scalarize_induction_variable_01( @@ -947,11 +949,12 @@ ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP21:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[TMP19]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float 0.000000e+00, [[TMP21]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] @@ -967,7 +970,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP28]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP28]], [[FOR_BODY]] ], [ [[RDX_START]], 
[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[S_LCSSA]] ; ; IND-LABEL: @scalarize_induction_variable_02( @@ -1007,11 +1010,12 @@ ; IND-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[TMP20:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[TMP18]]) +; IND-NEXT: [[RDX_START:%.*]] = fadd float [[TMP20]], 0.000000e+00 ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] @@ -1027,7 +1031,7 @@ ; IND-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; IND-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; IND: for.end: -; IND-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP27]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP27]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: ret float [[S_LCSSA]] ; ; UNROLL-LABEL: @scalarize_induction_variable_02( @@ -1086,11 +1090,12 @@ ; UNROLL: middle.block: ; UNROLL-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP35]], [[TMP34]] ; UNROLL-NEXT: [[TMP37:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[BIN_RDX]]) +; UNROLL-NEXT: [[RDX_START:%.*]] = fadd float [[TMP37]], 0.000000e+00 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] @@ -1106,7 +1111,7 @@ ; UNROLL-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL: for.end: -; UNROLL-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP44]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP44]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: ret float [[S_LCSSA]] ; ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02( @@ -1167,11 +1172,12 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP36]], [[TMP35]] ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = fadd float 0.000000e+00, [[TMP38]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: 
scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP38]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] @@ -1187,7 +1193,7 @@ ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL-NO-IC: for.end: -; UNROLL-NO-IC-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP45]], [[FOR_BODY]] ], [ [[TMP38]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[S_LCSSA:%.*]] = phi float [ [[TMP45]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret float [[S_LCSSA]] ; ; INTERLEAVE-LABEL: @scalarize_induction_variable_02( @@ -1239,10 +1245,11 @@ ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[TMP18]] ; INTERLEAVE-NEXT: [[TMP21:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[RDX_START:%.*]] = fadd float [[TMP21]], 0.000000e+00 ; INTERLEAVE-NEXT: br label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] @@ -2065,11 +2072,12 @@ ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP16]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP18]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] @@ -2087,7 +2095,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[VAR5]] ; ; IND-LABEL: @scalarize_induction_variable_05( @@ -2331,11 +2339,12 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP33]], [[TMP32]] ; UNROLL-NO-IC-NEXT: 
[[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP35]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] @@ -2353,7 +2362,7 @@ ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; UNROLL-NO-IC: for.end: -; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[VAR5]] ; ; INTERLEAVE-LABEL: @scalarize_induction_variable_05( @@ -2856,18 +2865,19 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) +; CHECK-NEXT: [[RDX_START:%.*]] = and i32 1, [[TMP2]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] @@ -2877,7 +2887,7 @@ ; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[B_NEXT]], 0 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; IND-LABEL: @i8_loop( @@ -2925,7 +2935,7 @@ ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; 
UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], ; UNROLL-NO-IC-NEXT: [[TMP1]] = and <2 x i32> [[VEC_PHI1]], @@ -2935,11 +2945,12 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = and i32 1, [[TMP3]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] @@ -2949,7 +2960,7 @@ ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i8 [[B_NEXT]], 0 ; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; INTERLEAVE-LABEL: @i8_loop( @@ -2995,18 +3006,19 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) +; CHECK-NEXT: [[RDX_START:%.*]] = and i32 1, [[TMP2]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 65536, 65536 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] @@ -3016,7 +3028,7 @@ ; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[B_0_NEXT]], 0 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], 
[[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; IND-LABEL: @i16_loop( @@ -3064,7 +3076,7 @@ ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], ; UNROLL-NO-IC-NEXT: [[TMP1]] = and <2 x i32> [[VEC_PHI1]], @@ -3074,11 +3086,12 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = and i32 1, [[TMP3]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 65536, 65536 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] @@ -3088,7 +3101,7 @@ ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i16 [[B_0_NEXT]], 0 ; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; INTERLEAVE-LABEL: @i16_loop( @@ -3138,18 +3151,19 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) +; CHECK-NEXT: [[RDX_START:%.*]] = and i32 1, [[TMP2]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[A_0:%.*]] 
= phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] @@ -3159,7 +3173,7 @@ ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; IND-LABEL: @max_i32_backedgetaken( @@ -3207,7 +3221,7 @@ ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], ; UNROLL-NO-IC-NEXT: [[TMP1]] = and <2 x i32> [[VEC_PHI1]], @@ -3217,11 +3231,12 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = and i32 1, [[TMP3]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] @@ -3231,7 +3246,7 @@ ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 ; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; INTERLEAVE-LABEL: @max_i32_backedgetaken( @@ -3292,24 +3307,24 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5]] = and <2 x i32> [[BROADCAST_SPLAT]], [[VEC_PHI]] +; CHECK-NEXT: 
[[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4]] = and <2 x i32> [[BROADCAST_SPLAT]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP4]]) +; CHECK-NEXT: [[RDX_START:%.*]] = and i32 [[C_PROMOTED_I]], [[TMP6]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[C_PROMOTED_I]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[C_PROMOTED_I]], [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[COND_END_I:%.*]] ; CHECK: cond.end.i: ; CHECK-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] @@ -3319,7 +3334,7 @@ ; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[AND_I_LCSSA]] ; ; IND-LABEL: @testoverflowcheck( @@ -3336,32 +3351,31 @@ ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], -2 ; IND-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] -; IND-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; IND-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 ; IND-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; IND-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; IND-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[TMP6:%.*]] = and <2 x i32> [[TMP4]], [[BROADCAST_SPLAT]] -; IND-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP6]]) +; IND-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BROADCAST_SPLAT]]) +; IND-NEXT: [[RDX_START:%.*]] = and i32 [[C_PROMOTED_I]], [[TMP5]] ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label 
[[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] +; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] ; IND-NEXT: br label [[COND_END_I:%.*]] ; IND: cond.end.i: ; IND-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] -; IND-NEXT: [[TMP8:%.*]] = and i32 [[BC_MERGE_RDX]], [[TMP0]] +; IND-NEXT: [[TMP6:%.*]] = and i32 [[BC_MERGE_RDX]], [[TMP0]] ; IND-NEXT: [[INC_I]] = add i8 [[INC4_I]], 1 ; IND-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; IND-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; IND: loopexit: -; IND-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[TMP8]], [[COND_END_I]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[TMP6]], [[COND_END_I]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: ret i32 [[AND_I_LCSSA]] ; ; UNROLL-LABEL: @testoverflowcheck( @@ -3378,34 +3392,33 @@ ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], -4 ; UNROLL-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] -; UNROLL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[TMP6:%.*]] = and <2 x i32> [[BROADCAST_SPLATINSERT2]], [[BROADCAST_SPLATINSERT]] -; UNROLL-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP7]], [[TMP4]] -; UNROLL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[TMP5:%.*]] = and <2 x i32> [[BROADCAST_SPLATINSERT2]], [[BROADCAST_SPLATINSERT]] +; UNROLL-NEXT: [[BIN_RDX:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[RDX_START:%.*]] = and i32 [[C_PROMOTED_I]], [[TMP6]] ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] ; UNROLL-NEXT: br label [[COND_END_I:%.*]] ; 
UNROLL: cond.end.i: ; UNROLL-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] -; UNROLL-NEXT: [[TMP9:%.*]] = and i32 [[BC_MERGE_RDX]], [[TMP0]] +; UNROLL-NEXT: [[TMP7:%.*]] = and i32 [[BC_MERGE_RDX]], [[TMP0]] ; UNROLL-NEXT: [[INC_I]] = add i8 [[INC4_I]], 1 ; UNROLL-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; UNROLL-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL: loopexit: -; UNROLL-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[TMP9]], [[COND_END_I]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[COND_END_I]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: ret i32 [[AND_I_LCSSA]] ; ; UNROLL-NO-IC-LABEL: @testoverflowcheck( @@ -3423,7 +3436,6 @@ ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 @@ -3431,21 +3443,22 @@ ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[TMP5]] = and <2 x i32> [[BROADCAST_SPLAT]], [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP6]] = and <2 x i32> [[BROADCAST_SPLAT3]], [[VEC_PHI1]] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[TMP4]] = and <2 x i32> [[BROADCAST_SPLAT]], [[VEC_PHI]] +; UNROLL-NO-IC-NEXT: [[TMP5]] = and <2 x i32> [[BROADCAST_SPLAT3]], [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP6]], [[TMP5]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP5]], [[TMP4]] +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = and i32 [[C_PROMOTED_I]], [[TMP7]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[C_PROMOTED_I]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[C_PROMOTED_I]], [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[COND_END_I:%.*]] ; UNROLL-NO-IC: cond.end.i: ; UNROLL-NO-IC-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] @@ -3455,7 +3468,7 @@ ; UNROLL-NO-IC-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; UNROLL-NO-IC-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-IC: loopexit: -; UNROLL-NO-IC-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[AND_I_LCSSA]] ; ; INTERLEAVE-LABEL: @testoverflowcheck( @@ -3472,34 +3485,33 @@ ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], -8 ; INTERLEAVE-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[TMP6:%.*]] = and <4 x i32> [[BROADCAST_SPLATINSERT2]], [[BROADCAST_SPLATINSERT]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP7]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = and <4 x i32> [[BROADCAST_SPLATINSERT2]], [[BROADCAST_SPLATINSERT]] +; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[RDX_START:%.*]] = and i32 [[C_PROMOTED_I]], [[TMP6]] ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] ; INTERLEAVE-NEXT: br 
label [[COND_END_I:%.*]] ; INTERLEAVE: cond.end.i: ; INTERLEAVE-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = and i32 [[BC_MERGE_RDX]], [[TMP0]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = and i32 [[BC_MERGE_RDX]], [[TMP0]] ; INTERLEAVE-NEXT: [[INC_I]] = add i8 [[INC4_I]], 1 ; INTERLEAVE-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; INTERLEAVE-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; INTERLEAVE: loopexit: -; INTERLEAVE-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[TMP9]], [[COND_END_I]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[COND_END_I]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: ret i32 [[AND_I_LCSSA]] ; entry: @@ -5253,48 +5265,48 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[A:%.*]], i32 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UREM_CONTINUE2]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UREM_CONTINUE2]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UREM_CONTINUE2]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; CHECK: pred.urem.if: -; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP1]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = urem i16 [[B:%.*]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i16 [[TMP0]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = urem i16 [[B:%.*]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i32 0 ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]] ; CHECK: pred.urem.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF1:%.*]], label [[PRED_UREM_CONTINUE2]] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_UREM_IF1:%.*]], label [[PRED_UREM_CONTINUE2]] ; CHECK: pred.urem.if1: -; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[TMP1]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = 
insertelement <2 x i16> [[TMP8]], i16 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i16 [[TMP0]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = urem i16 [[B]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> [[TMP7]], i16 [[TMP10]], i32 1 ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE2]] ; CHECK: pred.urem.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> -; CHECK-NEXT: [[TMP15]] = or <2 x i32> [[VEC_PHI]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP7]], [[PRED_UREM_CONTINUE]] ], [ [[TMP11]], [[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> zeroinitializer, <2 x i16> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> +; CHECK-NEXT: [[TMP14]] = or <2 x i32> [[VEC_PHI]], [[TMP13]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP14]]) +; CHECK-NEXT: [[RDX_START:%.*]] = or i32 [[A:%.*]], [[TMP16]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 20, 20 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ -20, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -5313,49 +5325,49 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[VAR7]] ; ; IND-LABEL: @PR32419( ; IND-NEXT: entry: ; IND-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: -; IND-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[A:%.*]], i64 0 ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE2:%.*]] ] -; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UREM_CONTINUE2]] ] +; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UREM_CONTINUE2]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UREM_CONTINUE2]] ] -; IND-NEXT: 
[[TMP1:%.*]] = trunc i32 [[INDEX]] to i16 -; IND-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer -; IND-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], -; IND-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0 -; IND-NEXT: br i1 [[TMP4]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; IND-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16 +; IND-NEXT: [[TMP1:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer +; IND-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], +; IND-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 +; IND-NEXT: br i1 [[TMP3]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; IND: pred.urem.if: -; IND-NEXT: [[TMP5:%.*]] = add i16 [[TMP1]], -20 -; IND-NEXT: [[TMP6:%.*]] = urem i16 [[B:%.*]], [[TMP5]] -; IND-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i64 0 +; IND-NEXT: [[TMP4:%.*]] = add i16 [[TMP0]], -20 +; IND-NEXT: [[TMP5:%.*]] = urem i16 [[B:%.*]], [[TMP4]] +; IND-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i64 0 ; IND-NEXT: br label [[PRED_UREM_CONTINUE]] ; IND: pred.urem.continue: -; IND-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ] -; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1 -; IND-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF1:%.*]], label [[PRED_UREM_CONTINUE2]] +; IND-NEXT: [[TMP7:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UREM_IF]] ] +; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i64 1 +; IND-NEXT: br i1 [[TMP8]], label [[PRED_UREM_IF1:%.*]], label [[PRED_UREM_CONTINUE2]] ; IND: pred.urem.if1: -; IND-NEXT: [[TMP10:%.*]] = add i16 [[TMP1]], -19 -; IND-NEXT: [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]] -; IND-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP11]], i64 1 +; IND-NEXT: [[TMP9:%.*]] = add i16 [[TMP0]], -19 +; IND-NEXT: [[TMP10:%.*]] = urem i16 [[B]], [[TMP9]] +; IND-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> [[TMP7]], i16 [[TMP10]], i64 1 ; IND-NEXT: br label [[PRED_UREM_CONTINUE2]] ; IND: pred.urem.continue2: -; IND-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF1]] ] -; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP13]] -; IND-NEXT: [[TMP14:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> -; IND-NEXT: [[TMP15]] = or <2 x i32> [[VEC_PHI]], [[TMP14]] +; IND-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP7]], [[PRED_UREM_CONTINUE]] ], [ [[TMP11]], [[PRED_UREM_IF1]] ] +; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> zeroinitializer, <2 x i16> [[TMP12]] +; IND-NEXT: [[TMP13:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> +; IND-NEXT: [[TMP14]] = or <2 x i32> [[VEC_PHI]], [[TMP13]] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], -; IND-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 -; IND-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] +; IND-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 +; IND-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP15]]) +; IND-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP14]]) +; IND-NEXT: [[RDX_START:%.*]] = or i32 [[TMP16]], [[A:%.*]] ; IND-NEXT: br i1 true, 
label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: br label [[FOR_BODY:%.*]] @@ -5366,74 +5378,74 @@ ; IND: for.inc: ; IND-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; IND: for.end: -; IND-NEXT: [[VAR7:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[VAR7:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: ret i32 [[VAR7]] ; ; UNROLL-LABEL: @PR32419( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: -; UNROLL-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[A:%.*]], i64 0 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE8:%.*]] ] -; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UREM_CONTINUE8]] ] -; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UREM_CONTINUE8]] ] +; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_UREM_CONTINUE8]] ] +; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UREM_CONTINUE8]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UREM_CONTINUE8]] ] -; UNROLL-NEXT: [[TMP1:%.*]] = trunc i32 [[INDEX]] to i16 -; UNROLL-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer -; UNROLL-NEXT: [[TMP3:%.*]] = icmp eq <2 x i16> [[VEC_IND]], +; UNROLL-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16 +; UNROLL-NEXT: [[TMP1:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer +; UNROLL-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], +; UNROLL-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP1]], ; UNROLL-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP2]], -; UNROLL-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], -; UNROLL-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i64 0 -; UNROLL-NEXT: br i1 [[TMP6]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; UNROLL-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0 +; UNROLL-NEXT: br i1 [[TMP5]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; UNROLL: pred.urem.if: -; UNROLL-NEXT: [[TMP7:%.*]] = add i16 [[TMP1]], -20 -; UNROLL-NEXT: [[TMP8:%.*]] = urem i16 [[B:%.*]], [[TMP7]] -; UNROLL-NEXT: [[TMP9:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0 +; UNROLL-NEXT: [[TMP6:%.*]] = add i16 [[TMP0]], -20 +; UNROLL-NEXT: [[TMP7:%.*]] = urem i16 [[B:%.*]], [[TMP6]] +; UNROLL-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i64 0 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE]] ; UNROLL: pred.urem.continue: -; UNROLL-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UREM_IF]] ] -; UNROLL-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i64 1 -; UNROLL-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] +; UNROLL-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UREM_IF]] ] +; UNROLL-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1 +; UNROLL-NEXT: br i1 [[TMP10]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] ; UNROLL: pred.urem.if3: -; UNROLL-NEXT: [[TMP12:%.*]] = add i16 [[TMP1]], -19 -; UNROLL-NEXT: [[TMP13:%.*]] = urem i16 [[B]], 
[[TMP12]] -; UNROLL-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP13]], i64 1 +; UNROLL-NEXT: [[TMP11:%.*]] = add i16 [[TMP0]], -19 +; UNROLL-NEXT: [[TMP12:%.*]] = urem i16 [[B]], [[TMP11]] +; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP12]], i64 1 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE4]] ; UNROLL: pred.urem.continue4: -; UNROLL-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP10]], [[PRED_UREM_CONTINUE]] ], [ [[TMP14]], [[PRED_UREM_IF3]] ] -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i64 0 -; UNROLL-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] +; UNROLL-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_UREM_CONTINUE]] ], [ [[TMP13]], [[PRED_UREM_IF3]] ] +; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP4]], i64 0 +; UNROLL-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] ; UNROLL: pred.urem.if5: -; UNROLL-NEXT: [[TMP17:%.*]] = add i16 [[TMP1]], -18 -; UNROLL-NEXT: [[TMP18:%.*]] = urem i16 [[B]], [[TMP17]] -; UNROLL-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i64 0 +; UNROLL-NEXT: [[TMP16:%.*]] = add i16 [[TMP0]], -18 +; UNROLL-NEXT: [[TMP17:%.*]] = urem i16 [[B]], [[TMP16]] +; UNROLL-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP17]], i64 0 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE6]] ; UNROLL: pred.urem.continue6: -; UNROLL-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP19]], [[PRED_UREM_IF5]] ] -; UNROLL-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i64 1 -; UNROLL-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] +; UNROLL-NEXT: [[TMP19:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP18]], [[PRED_UREM_IF5]] ] +; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP4]], i64 1 +; UNROLL-NEXT: br i1 [[TMP20]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] ; UNROLL: pred.urem.if7: -; UNROLL-NEXT: [[TMP22:%.*]] = add i16 [[TMP1]], -17 -; UNROLL-NEXT: [[TMP23:%.*]] = urem i16 [[B]], [[TMP22]] -; UNROLL-NEXT: [[TMP24:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP23]], i64 1 +; UNROLL-NEXT: [[TMP21:%.*]] = add i16 [[TMP0]], -17 +; UNROLL-NEXT: [[TMP22:%.*]] = urem i16 [[B]], [[TMP21]] +; UNROLL-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP22]], i64 1 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE8]] ; UNROLL: pred.urem.continue8: -; UNROLL-NEXT: [[TMP25:%.*]] = phi <2 x i16> [ [[TMP20]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP24]], [[PRED_UREM_IF7]] ] -; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP15]] -; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP25]] -; UNROLL-NEXT: [[TMP26:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> -; UNROLL-NEXT: [[TMP27:%.*]] = sext <2 x i16> [[PREDPHI9]] to <2 x i32> -; UNROLL-NEXT: [[TMP28]] = or <2 x i32> [[VEC_PHI]], [[TMP26]] -; UNROLL-NEXT: [[TMP29]] = or <2 x i32> [[VEC_PHI1]], [[TMP27]] +; UNROLL-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP19]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP23]], [[PRED_UREM_IF7]] ] +; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> zeroinitializer, <2 x i16> [[TMP14]] +; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP24]] +; UNROLL-NEXT: [[TMP25:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> +; UNROLL-NEXT: [[TMP26:%.*]] = 
sext <2 x i16> [[PREDPHI9]] to <2 x i32> +; UNROLL-NEXT: [[TMP27]] = or <2 x i32> [[VEC_PHI]], [[TMP25]] +; UNROLL-NEXT: [[TMP28]] = or <2 x i32> [[VEC_PHI1]], [[TMP26]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], -; UNROLL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 -; UNROLL-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] +; UNROLL-NEXT: [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 +; UNROLL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP29]], [[TMP28]] -; UNROLL-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP28]], [[TMP27]] +; UNROLL-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[RDX_START:%.*]] = or i32 [[TMP30]], [[A:%.*]] ; UNROLL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] @@ -5444,81 +5456,81 @@ ; UNROLL: for.inc: ; UNROLL-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; UNROLL: for.end: -; UNROLL-NEXT: [[VAR7:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[VAR7:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: ret i32 [[VAR7]] ; ; UNROLL-NO-IC-LABEL: @PR32419( ; UNROLL-NO-IC-NEXT: entry: ; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[A:%.*]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE8:%.*]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UREM_CONTINUE8]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UREM_CONTINUE8]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_UREM_CONTINUE8]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UREM_CONTINUE8]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UREM_CONTINUE8]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i16> [[VEC_IND]], ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]] -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq <2 x i16> [[STEP_ADD]], zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <2 x i16> [[VEC_IND]], zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[STEP_ADD]], zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP1]], ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP2]], -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 
0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; UNROLL-NO-IC: pred.urem.if: -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i16 [[TMP1]], 0 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = urem i16 [[B:%.*]], [[TMP7]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i16 [[TMP0]], 0 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = urem i16 [[B:%.*]], [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE]] ; UNROLL-NO-IC: pred.urem.continue: -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UREM_IF]] ] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UREM_IF]] ] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] ; UNROLL-NO-IC: pred.urem.if3: -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i16 [[TMP1]], 1 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = urem i16 [[B]], [[TMP12]] -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP13]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add i16 [[TMP0]], 1 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = urem i16 [[B]], [[TMP11]] +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP12]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE4]] ; UNROLL-NO-IC: pred.urem.continue4: -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP10]], [[PRED_UREM_CONTINUE]] ], [ [[TMP14]], [[PRED_UREM_IF3]] ] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_UREM_CONTINUE]] ], [ [[TMP13]], [[PRED_UREM_IF3]] ] +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; UNROLL-NO-IC-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] ; UNROLL-NO-IC: pred.urem.if5: -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = add i16 [[TMP1]], 2 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = urem i16 [[B]], [[TMP17]] -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = add i16 [[TMP0]], 2 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = urem i16 [[B]], [[TMP16]] +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP17]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE6]] ; UNROLL-NO-IC: pred.urem.continue6: -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP19]], [[PRED_UREM_IF5]] ] -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP18]], [[PRED_UREM_IF5]] ] +; UNROLL-NO-IC-NEXT: 
[[TMP20:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] ; UNROLL-NO-IC: pred.urem.if7: -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = add i16 [[TMP1]], 3 -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = urem i16 [[B]], [[TMP22]] -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP23]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = add i16 [[TMP0]], 3 +; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = urem i16 [[B]], [[TMP21]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP22]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE8]] ; UNROLL-NO-IC: pred.urem.continue8: -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = phi <2 x i16> [ [[TMP20]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP24]], [[PRED_UREM_IF7]] ] -; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP15]] -; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP25]] -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = sext <2 x i16> [[PREDPHI9]] to <2 x i32> -; UNROLL-NO-IC-NEXT: [[TMP28]] = or <2 x i32> [[VEC_PHI]], [[TMP26]] -; UNROLL-NO-IC-NEXT: [[TMP29]] = or <2 x i32> [[VEC_PHI1]], [[TMP27]] +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP19]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP23]], [[PRED_UREM_IF7]] ] +; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> zeroinitializer, <2 x i16> [[TMP14]] +; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP24]] +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = sext <2 x i16> [[PREDPHI9]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP27]] = or <2 x i32> [[VEC_PHI]], [[TMP25]] +; UNROLL-NO-IC-NEXT: [[TMP28]] = or <2 x i32> [[VEC_PHI1]], [[TMP26]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 -; UNROLL-NO-IC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20 +; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP29]], [[TMP28]] -; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP28]], [[TMP27]] +; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = or i32 [[A:%.*]], [[TMP30]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 20, 20 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ -20, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = 
phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -5537,114 +5549,114 @@ ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0 ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; UNROLL-NO-IC: for.end: -; UNROLL-NO-IC-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[VAR7]] ; ; INTERLEAVE-LABEL: @PR32419( ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[A:%.*]], i64 0 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE16:%.*]] ] -; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UREM_CONTINUE16]] ] -; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UREM_CONTINUE16]] ] +; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UREM_CONTINUE16]] ] +; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UREM_CONTINUE16]] ] ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UREM_CONTINUE16]] ] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = trunc i32 [[INDEX]] to i16 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[VEC_IND]], zeroinitializer -; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq <4 x i16> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[VEC_IND]], zeroinitializer +; INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], ; INTERLEAVE-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], -; INTERLEAVE-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], -; INTERLEAVE-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 -; INTERLEAVE-NEXT: br i1 [[TMP6]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; INTERLEAVE: pred.urem.if: -; INTERLEAVE-NEXT: [[TMP7:%.*]] = add i16 [[TMP1]], -20 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = urem i16 [[B:%.*]], [[TMP7]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[TMP8]], i64 0 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = add i16 [[TMP0]], -20 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = urem i16 [[B:%.*]], [[TMP6]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i64 0 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE]] ; INTERLEAVE: pred.urem.continue: -; INTERLEAVE-NEXT: [[TMP10:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UREM_IF]] ] -; INTERLEAVE-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1 -; INTERLEAVE-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UREM_IF]] ] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = extractelement <4 x 
i1> [[TMP3]], i64 1 +; INTERLEAVE-NEXT: br i1 [[TMP10]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] ; INTERLEAVE: pred.urem.if3: -; INTERLEAVE-NEXT: [[TMP12:%.*]] = add i16 [[TMP1]], -19 -; INTERLEAVE-NEXT: [[TMP13:%.*]] = urem i16 [[B]], [[TMP12]] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[TMP13]], i64 1 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = add i16 [[TMP0]], -19 +; INTERLEAVE-NEXT: [[TMP12:%.*]] = urem i16 [[B]], [[TMP11]] +; INTERLEAVE-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP12]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE4]] ; INTERLEAVE: pred.urem.continue4: -; INTERLEAVE-NEXT: [[TMP15:%.*]] = phi <4 x i16> [ [[TMP10]], [[PRED_UREM_CONTINUE]] ], [ [[TMP14]], [[PRED_UREM_IF3]] ] -; INTERLEAVE-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2 -; INTERLEAVE-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] +; INTERLEAVE-NEXT: [[TMP14:%.*]] = phi <4 x i16> [ [[TMP9]], [[PRED_UREM_CONTINUE]] ], [ [[TMP13]], [[PRED_UREM_IF3]] ] +; INTERLEAVE-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 +; INTERLEAVE-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] ; INTERLEAVE: pred.urem.if5: -; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i16 [[TMP1]], -18 -; INTERLEAVE-NEXT: [[TMP18:%.*]] = urem i16 [[B]], [[TMP17]] -; INTERLEAVE-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP18]], i64 2 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = add i16 [[TMP0]], -18 +; INTERLEAVE-NEXT: [[TMP17:%.*]] = urem i16 [[B]], [[TMP16]] +; INTERLEAVE-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP17]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE6]] ; INTERLEAVE: pred.urem.continue6: -; INTERLEAVE-NEXT: [[TMP20:%.*]] = phi <4 x i16> [ [[TMP15]], [[PRED_UREM_CONTINUE4]] ], [ [[TMP19]], [[PRED_UREM_IF5]] ] -; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3 -; INTERLEAVE-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]] +; INTERLEAVE-NEXT: [[TMP19:%.*]] = phi <4 x i16> [ [[TMP14]], [[PRED_UREM_CONTINUE4]] ], [ [[TMP18]], [[PRED_UREM_IF5]] ] +; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 +; INTERLEAVE-NEXT: br i1 [[TMP20]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]] ; INTERLEAVE: pred.urem.if7: -; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i16 [[TMP1]], -17 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = urem i16 [[B]], [[TMP22]] -; INTERLEAVE-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP23]], i64 3 +; INTERLEAVE-NEXT: [[TMP21:%.*]] = add i16 [[TMP0]], -17 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = urem i16 [[B]], [[TMP21]] +; INTERLEAVE-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP19]], i16 [[TMP22]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE8]] ; INTERLEAVE: pred.urem.continue8: -; INTERLEAVE-NEXT: [[TMP25:%.*]] = phi <4 x i16> [ [[TMP20]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP24]], [[PRED_UREM_IF7]] ] -; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0 -; INTERLEAVE-NEXT: br i1 [[TMP26]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]] +; INTERLEAVE-NEXT: [[TMP24:%.*]] = phi <4 x i16> [ [[TMP19]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP23]], [[PRED_UREM_IF7]] ] +; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP25]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]] ; INTERLEAVE: 
pred.urem.if9: -; INTERLEAVE-NEXT: [[TMP27:%.*]] = add i16 [[TMP1]], -16 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = urem i16 [[B]], [[TMP27]] -; INTERLEAVE-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> poison, i16 [[TMP28]], i64 0 +; INTERLEAVE-NEXT: [[TMP26:%.*]] = add i16 [[TMP0]], -16 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = urem i16 [[B]], [[TMP26]] +; INTERLEAVE-NEXT: [[TMP28:%.*]] = insertelement <4 x i16> poison, i16 [[TMP27]], i64 0 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE10]] ; INTERLEAVE: pred.urem.continue10: -; INTERLEAVE-NEXT: [[TMP30:%.*]] = phi <4 x i16> [ poison, [[PRED_UREM_CONTINUE8]] ], [ [[TMP29]], [[PRED_UREM_IF9]] ] -; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1 -; INTERLEAVE-NEXT: br i1 [[TMP31]], label [[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]] +; INTERLEAVE-NEXT: [[TMP29:%.*]] = phi <4 x i16> [ poison, [[PRED_UREM_CONTINUE8]] ], [ [[TMP28]], [[PRED_UREM_IF9]] ] +; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1 +; INTERLEAVE-NEXT: br i1 [[TMP30]], label [[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]] ; INTERLEAVE: pred.urem.if11: -; INTERLEAVE-NEXT: [[TMP32:%.*]] = add i16 [[TMP1]], -15 -; INTERLEAVE-NEXT: [[TMP33:%.*]] = urem i16 [[B]], [[TMP32]] -; INTERLEAVE-NEXT: [[TMP34:%.*]] = insertelement <4 x i16> [[TMP30]], i16 [[TMP33]], i64 1 +; INTERLEAVE-NEXT: [[TMP31:%.*]] = add i16 [[TMP0]], -15 +; INTERLEAVE-NEXT: [[TMP32:%.*]] = urem i16 [[B]], [[TMP31]] +; INTERLEAVE-NEXT: [[TMP33:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP32]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE12]] ; INTERLEAVE: pred.urem.continue12: -; INTERLEAVE-NEXT: [[TMP35:%.*]] = phi <4 x i16> [ [[TMP30]], [[PRED_UREM_CONTINUE10]] ], [ [[TMP34]], [[PRED_UREM_IF11]] ] -; INTERLEAVE-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2 -; INTERLEAVE-NEXT: br i1 [[TMP36]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]] +; INTERLEAVE-NEXT: [[TMP34:%.*]] = phi <4 x i16> [ [[TMP29]], [[PRED_UREM_CONTINUE10]] ], [ [[TMP33]], [[PRED_UREM_IF11]] ] +; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2 +; INTERLEAVE-NEXT: br i1 [[TMP35]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]] ; INTERLEAVE: pred.urem.if13: -; INTERLEAVE-NEXT: [[TMP37:%.*]] = add i16 [[TMP1]], -14 -; INTERLEAVE-NEXT: [[TMP38:%.*]] = urem i16 [[B]], [[TMP37]] -; INTERLEAVE-NEXT: [[TMP39:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP38]], i64 2 +; INTERLEAVE-NEXT: [[TMP36:%.*]] = add i16 [[TMP0]], -14 +; INTERLEAVE-NEXT: [[TMP37:%.*]] = urem i16 [[B]], [[TMP36]] +; INTERLEAVE-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> [[TMP34]], i16 [[TMP37]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE14]] ; INTERLEAVE: pred.urem.continue14: -; INTERLEAVE-NEXT: [[TMP40:%.*]] = phi <4 x i16> [ [[TMP35]], [[PRED_UREM_CONTINUE12]] ], [ [[TMP39]], [[PRED_UREM_IF13]] ] -; INTERLEAVE-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3 -; INTERLEAVE-NEXT: br i1 [[TMP41]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16]] +; INTERLEAVE-NEXT: [[TMP39:%.*]] = phi <4 x i16> [ [[TMP34]], [[PRED_UREM_CONTINUE12]] ], [ [[TMP38]], [[PRED_UREM_IF13]] ] +; INTERLEAVE-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3 +; INTERLEAVE-NEXT: br i1 [[TMP40]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16]] ; INTERLEAVE: pred.urem.if15: -; INTERLEAVE-NEXT: [[TMP42:%.*]] = add i16 [[TMP1]], -13 -; INTERLEAVE-NEXT: [[TMP43:%.*]] = urem i16 [[B]], 
[[TMP42]] -; INTERLEAVE-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP43]], i64 3 +; INTERLEAVE-NEXT: [[TMP41:%.*]] = add i16 [[TMP0]], -13 +; INTERLEAVE-NEXT: [[TMP42:%.*]] = urem i16 [[B]], [[TMP41]] +; INTERLEAVE-NEXT: [[TMP43:%.*]] = insertelement <4 x i16> [[TMP39]], i16 [[TMP42]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE16]] ; INTERLEAVE: pred.urem.continue16: -; INTERLEAVE-NEXT: [[TMP45:%.*]] = phi <4 x i16> [ [[TMP40]], [[PRED_UREM_CONTINUE14]] ], [ [[TMP44]], [[PRED_UREM_IF15]] ] -; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> zeroinitializer, <4 x i16> [[TMP25]] -; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> zeroinitializer, <4 x i16> [[TMP45]] -; INTERLEAVE-NEXT: [[TMP46:%.*]] = sext <4 x i16> [[PREDPHI]] to <4 x i32> -; INTERLEAVE-NEXT: [[TMP47:%.*]] = sext <4 x i16> [[PREDPHI17]] to <4 x i32> -; INTERLEAVE-NEXT: [[TMP48]] = or <4 x i32> [[VEC_PHI]], [[TMP46]] -; INTERLEAVE-NEXT: [[TMP49]] = or <4 x i32> [[VEC_PHI1]], [[TMP47]] +; INTERLEAVE-NEXT: [[TMP44:%.*]] = phi <4 x i16> [ [[TMP39]], [[PRED_UREM_CONTINUE14]] ], [ [[TMP43]], [[PRED_UREM_IF15]] ] +; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> zeroinitializer, <4 x i16> [[TMP24]] +; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> zeroinitializer, <4 x i16> [[TMP44]] +; INTERLEAVE-NEXT: [[TMP45:%.*]] = sext <4 x i16> [[PREDPHI]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP46:%.*]] = sext <4 x i16> [[PREDPHI17]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP47]] = or <4 x i32> [[VEC_PHI]], [[TMP45]] +; INTERLEAVE-NEXT: [[TMP48]] = or <4 x i32> [[VEC_PHI1]], [[TMP46]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; INTERLEAVE-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP49:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 +; INTERLEAVE-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP49]], [[TMP48]] -; INTERLEAVE-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP48]], [[TMP47]] +; INTERLEAVE-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[RDX_START:%.*]] = or i32 [[TMP50]], [[A:%.*]] ; INTERLEAVE-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -4, [[MIDDLE_BLOCK]] ], [ -20, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_START]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -5663,7 +5675,7 @@ ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0 ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; INTERLEAVE: for.end: -; INTERLEAVE-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[VAR7:%.*]] = 
phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: ret i32 [[VAR7]] ; entry: @@ -5730,6 +5742,7 @@ ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP10]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP12]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 113, 112 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 1 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 0 @@ -5738,10 +5751,10 @@ ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 42, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[DOTLCSSA]] ; CHECK: loop: ; CHECK-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] @@ -5956,6 +5969,7 @@ ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP21]], [[TMP20]] ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[RDX_START:%.*]] = add i64 0, [[TMP23]] ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 113, 112 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD5]], i32 1 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[STEP_ADD5]], i32 0 @@ -5964,10 +5978,10 @@ ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 42, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i64 [[DOTLCSSA]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] Index: llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -762,17 +762,17 @@ ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: 
[[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32>
-; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <12 x i32>
+; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]],
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -844,30 +844,15 @@
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
-; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
-; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* @SA, align 4
-; CHECK-NEXT: store float [[ADD3_LCSSA]], float* @SB, align 4
+; CHECK-NEXT: store float 0x7FF8000000000000, float* @SB, align 4
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
Index: llvm/test/Transforms/LoopVectorize/loop-form.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/loop-form.ll
+++ llvm/test/Transforms/LoopVectorize/loop-form.ll
@@ -1211,10 +1211,11 @@
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP4]])
+; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP6]]
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
Index: llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
+++ llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
@@ -15,13 +15,13 @@
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE4]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE2]] ]
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i16 41, [[TMP0]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]],
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]],
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IV]],
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -39,7 +39,7 @@
; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
; CHECK: pred.load.if1:
; CHECK-NEXT: [[TMP14:%.*]] = add i16 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP15:%.*]] = add nsw i16 [[TMP14]], -1
@@ -49,10 +49,10 @@
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [40 x [4 x i16]], [40 x [4 x i16]]* @A, i16 0, i16 [[TMP15]], i16 3
; CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[TMP19]], align 1
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP20]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i16> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i16> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[TMP24:%.*]] = add nsw <2 x i16> [[TMP22]], [[TMP23]]
; CHECK-NEXT: [[TMP25]] = add <2 x i16> [[VEC_PHI]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]]
@@ -61,10 +61,11 @@
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP26]])
+; CHECK-NEXT: [[RDX_START:%.*]] = add i16 0, [[TMP28]]
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -1, [[MIDDLE_BLOCK]] ], [ 41, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IVMINUS1:%.*]], [[LOOP]] ]
@@ -79,7 +80,7 @@
; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[IV]], 1
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[PREVSUM_LCSSA:%.*]] = phi i16 [ [[PREVSUM]], [[LOOP]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PREVSUM_LCSSA:%.*]] = phi i16 [ [[PREVSUM]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i16 [[PREVSUM_LCSSA]]
;
entry:
Index: llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -16,7 +16,7 @@
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]],
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]],
; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT]],
@@ -36,11 +36,12 @@
; CHECK: middle.block:
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]])
+; CHECK-NEXT: [[RDX_START:%.*]] = add i32 35902, [[TMP10]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 176, 176
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 35902, [[BB]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 35902, [[BB]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -62,7 +63,7 @@
; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: [[E_1:%.*]] = phi i32 [ [[P_1]], [[LOOP_LATCH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[E_2:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[E_2:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_1]], [[E_2]]
; CHECK-NEXT: ret i32 [[RES]]
;
Index: llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -1344,34 +1344,34 @@
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[S:%.*]], i64 0
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]],
-; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]],
-;
CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> , <4 x i1> [[TMP11]] -; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> , <4 x float> [[PREDPHI_V]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP8]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> , <4 x i1> [[TMP10]] +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP12]], <4 x float> , <4 x float> [[PREDPHI_V]] ; CHECK-NEXT: [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float [[TMP14]], [[S:%.*]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -1388,7 +1388,7 @@ ; CHECK: for.inc: ; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[SUM_1_LCSSA]] ; entry: @@ -1441,7 +1441,7 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] 
], [ [[TMP30:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 @@ -1497,13 +1497,14 @@ ; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP30]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP33:%.*]] = trunc <4 x i32> [[TMP32]] to <4 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP33]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i8 [[TMP34]], -1 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: ; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: ._crit_edge: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i8 [ poison, [[DOTLR_PH]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i8 [ poison, [[DOTLR_PH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[SUM_0_LCSSA]] ; entry: @@ -1537,7 +1538,7 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] Index: llvm/test/Transforms/LoopVectorize/reduction-inloop.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -796,34 +796,34 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[S:%.*]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = xor 
<4 x i1> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> , <4 x i1> [[TMP11]] -; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> , <4 x float> [[PREDPHI_V]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP8]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> , <4 x i1> [[TMP10]] +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP12]], <4 x float> , <4 x float> [[PREDPHI_V]] ; CHECK-NEXT: [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float [[TMP14]], [[S:%.*]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -840,7 +840,7 @@ ; CHECK: for.inc: ; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[SUM_1_LCSSA]] ; entry: @@ -998,7 +998,7 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] 
= phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>* @@ -1009,13 +1009,14 @@ ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i8 [[TMP5]], -1 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: ; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: ._crit_edge: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i8 [ poison, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i8 [ poison, [[DOTLR_PH]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[SUM_0_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/reduction-predselect.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -690,13 +690,14 @@ ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP42]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fadd float [[TMP44]], 0.000000e+00 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: @@ -729,7 +730,7 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] @@ -795,13 +796,14 @@ ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP42]]) +; CHECK-NEXT: [[RDX_START:%.*]] = fmul float [[TMP44]], 0.000000e+00 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP17:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/reduction-small-size.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -31,11 +31,12 @@ ; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 0, [[TMP8]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] @@ -51,7 +52,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] ; @@ -92,27 +93,27 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[X]], [[N_VEC]] -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[Y:%.*]], i32 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i1> -; CHECK-NEXT: [[TMP7]] = sext <4 x i1> [[TMP6]] to <4 x i32> -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP6]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = 
trunc <4 x i32> [[TMP7]] to <4 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = sext i1 [[TMP8]] to i32 +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 [[Y:%.*]], [[TMP9]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[X]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[Y]], [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[Y]], [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] @@ -123,7 +124,7 @@ ; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[I]], 77 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T1:%.*]] = phi i32 [ [[R_NEXT]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T1:%.*]] = phi i32 [ [[R_NEXT]], [[FOR_BODY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T1]] ; entry: Index: llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll +++ llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll @@ -5,32 +5,80 @@ define i8 @reduction_add_trunc(i8* noalias nocapture %A) { ; CHECK-LABEL: @reduction_add_trunc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 256, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 256, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 256, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[TMP36:%.*]], %vector.body ] -; CHECK: [[TMP14:%.*]] = and [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = and [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer) -; CHECK: [[WIDE_LOAD:%.*]] = load , * -; CHECK: [[WIDE_LOAD2:%.*]] = load , * -; CHECK-NEXT: [[TMP26:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP27:%.*]] = zext [[WIDE_LOAD2]] to -; CHECK-NEXT: [[TMP28:%.*]] = add [[TMP14]], [[TMP26]] -; CHECK-NEXT: [[TMP29:%.*]] = add [[TMP15]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 
[[TMP31]] -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}} -; CHECK-NEXT: [[TMP33:%.*]] = trunc [[TMP28]] to -; CHECK-NEXT: [[TMP34]] = zext [[TMP33]] to -; CHECK-NEXT: [[TMP35:%.*]] = trunc [[TMP29]] to -; CHECK-NEXT: [[TMP36]] = zext [[TMP35]] to +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = and [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = and [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 255, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to * +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , * [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP10]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP11]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP27:%.*]] = trunc [[TMP22]] to +; CHECK-NEXT: [[TMP28]] = zext [[TMP27]] to +; CHECK-NEXT: [[TMP29:%.*]] = trunc [[TMP23]] to +; CHECK-NEXT: [[TMP30]] = zext [[TMP29]] to +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP37:%.*]] = trunc [[TMP34]] to -; CHECK-NEXT: [[TMP38:%.*]] = trunc [[TMP36]] to -; CHECK-NEXT: [[BIN_RDX:%.*]] = add [[TMP38]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8( [[BIN_RDX]]) -; CHECK-NEXT: [[TMP40:%.*]] = zext i8 [[TMP39]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = trunc [[TMP28]] to +; CHECK-NEXT: [[TMP32:%.*]] = trunc [[TMP30]] to +; CHECK-NEXT: [[BIN_RDX:%.*]] = add [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8( [[BIN_RDX]]) +; CHECK-NEXT: [[TMP34:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[RDX_START:%.*]] = add i32 255, [[TMP34]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; 
CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 255, [[ENTRY]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255 +; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L3:%.*]] = load i8, i8* [[L2]], align 4 +; CHECK-NEXT: [[L3E:%.*]] = zext i8 [[L3]] to i32 +; CHECK-NEXT: [[L9]] = add i32 [[SUM_02]], [[L3E]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[LOOP]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[SUM_0_LCSSA]] to i8 +; CHECK-NEXT: ret i8 [[RET]] ; entry: br label %loop Index: llvm/test/Transforms/LoopVectorize/trunc-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/trunc-reductions.ll +++ llvm/test/Transforms/LoopVectorize/trunc-reductions.ll @@ -9,25 +9,17 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3]] = and <8 x i8> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[AND_LCSSA_OFF0:%.*]] = phi i8 [ poison, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i8 [[AND_LCSSA_OFF0]] +; CHECK-NEXT: ret i8 0 ; entry: br label %for.body @@ -147,9 +139,24 @@ define i8 @reduction_smin_trunc(i8* noalias nocapture %ptr) { ; CHECK-LABEL: @reduction_smin_trunc( -; CHECK-NOT: vector.body -; CHECK-NOT: <8 x -; CHECK: ret +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 256, [[ENTRY]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64 +; 
CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1 +; CHECK-NEXT: [[EXT:%.*]] = sext i8 [[LOAD]] to i32 +; CHECK-NEXT: [[TMP1]] = call i32 @llvm.smin.i32(i32 [[SUM_02]], i32 [[EXT]]) +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[RET]] +; entry: br label %for.body @@ -173,9 +180,24 @@ define i8 @reduction_umin_trunc(i8* noalias nocapture %ptr) { ; CHECK-LABEL: @reduction_umin_trunc( -; CHECK-NOT: vector.body -; CHECK-NOT: <8 x -; CHECK: ret +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1 +; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[LOAD]] to i32 +; CHECK-NEXT: [[TMP1]] = call i32 @llvm.umin.i32(i32 [[SUM_02]], i32 [[EXT]]) +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[RET]] +; entry: br label %for.body @@ -199,9 +221,24 @@ define i16 @reduction_smax_trunc(i16* noalias nocapture %ptr) { ; CHECK-LABEL: @reduction_smax_trunc( -; CHECK-NOT: vector.body -; CHECK-NOT: <8 x -; CHECK: ret +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2 +; CHECK-NEXT: [[EXT:%.*]] = sext i16 [[LOAD]] to i32 +; CHECK-NEXT: [[TMP1]] = call i32 @llvm.smax.i32(i32 [[SUM_02]], i32 [[EXT]]) +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[RET]] +; entry: br label %for.body @@ -225,9 +262,24 @@ define i16 @reduction_umax_trunc(i16* noalias nocapture %ptr) { ; CHECK-LABEL: @reduction_umax_trunc( -; CHECK-NOT: vector.body -; CHECK-NOT: <8 x -; CHECK: ret +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 
[[TMP0]] +; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2 +; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[LOAD]] to i32 +; CHECK-NEXT: [[TMP1]] = call i32 @llvm.umax.i32(i32 [[SUM_02]], i32 [[EXT]]) +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[RET]] +; entry: br label %for.body Index: llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -43,39 +43,39 @@ ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[WIDE_LOAD18]], [[VEC_PHI17]] -; CHECK-NEXT: [[TMP16]] = add <2 x i64> [[TMP14]], [[WIDE_LOAD19]] -; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[TMP15]], [[WIDE_LOAD20]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = 
getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, <2 x i64>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[WIDE_LOAD18]], [[VEC_PHI17]] +; CHECK-NEXT: [[TMP15]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD19]] +; CHECK-NEXT: [[TMP16]] = add <2 x i64> [[TMP14]], [[WIDE_LOAD20]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 [[SUM_NEXT_PEEL]], [[TMP18]] ; CHECK-NEXT: br label [[LOOP_PREHEADER21]] ; CHECK: loop.preheader21: ; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER21]] ] @@ -170,47 +170,47 @@ ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI30:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI30:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, <2 x 
i64>* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, i64* [[START_I14_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[TMP14]], i64 2 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[WIDE_LOAD31]], [[VEC_PHI30]] -; CHECK-NEXT: [[TMP20:%.*]] = add <2 x i64> [[TMP18]], [[WIDE_LOAD32]] -; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i64> [[TMP19]], [[WIDE_LOAD33]] -; CHECK-NEXT: [[TMP22]] = add <2 x i64> [[TMP20]], [[WIDE_LOAD34]] -; CHECK-NEXT: [[TMP23]] = add <2 x i64> [[TMP21]], [[WIDE_LOAD35]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, <2 x i64>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, <2 x i64>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[START_I14_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x i64>, <2 x i64>* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, i64* [[TMP13]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <2 x i64>, <2 x i64>* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[WIDE_LOAD31]], [[VEC_PHI30]] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[TMP17]], [[WIDE_LOAD32]] +; CHECK-NEXT: [[TMP20:%.*]] = add <2 x i64> [[TMP18]], [[WIDE_LOAD33]] +; CHECK-NEXT: [[TMP21]] = add <2 x i64> [[TMP19]], [[WIDE_LOAD34]] +; CHECK-NEXT: [[TMP22]] = add <2 x i64> [[TMP20]], [[WIDE_LOAD35]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[RDX_START:%.*]] = add i64 [[SUM_NEXT_PEEL]], [[TMP24]] ; CHECK-NEXT: br label [[LOOP_PREHEADER36]] ; CHECK: loop.preheader36: ; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[RDX_START]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER36]] ]