Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -503,6 +503,13 @@
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);
 
+  /// Record the fact that an existing scalar lane defined for DefInstance also
+  /// produces a result which can be used for UseInstance. Used to avoid
+  /// redundant computation during replication.
+  void reuseScalarLane(Instruction *Instr, const VPIteration &DefInstance,
+                       const VPIteration &UseInstance);
+
   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
   /// is provided, the integer induction variable will first be truncated to
   /// the corresponding type.
@@ -2687,6 +2694,13 @@
   PredicatedInstructions.push_back(Cloned);
 }
 
+void InnerLoopVectorizer::reuseScalarLane(Instruction *Instr,
+                                          const VPIteration &DefInstance,
+                                          const VPIteration &UseInstance) {
+  auto *Def = VectorLoopValueMap.getScalarValue(Instr, DefInstance);
+  VectorLoopValueMap.setScalarValue(Instr, UseInstance, Def);
+}
+
 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                       Value *End, Value *Step,
                                                       Instruction *DL) {
@@ -7534,12 +7548,25 @@
       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);
 
+  auto isGloballyUniform = [&](ElementCount VF) {
+    if (!IsUniform)
+      return false;
+    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I))
+      return true;
+    return Legal->isUniform(I);
+  };
+
+  bool IsGloballyUniform =
+      LoopVectorizationPlanner::getDecisionAndClampRange(isGloballyUniform,
+                                                         Range);
+
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
       Range);
+
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
-                                       IsUniform, IsPredicated);
+                                       IsUniform, IsGloballyUniform,
+                                       IsPredicated);
   setRecipe(I, Recipe);
 
   // Find if I uses a predicated instruction. If so, it will use its scalar
@@ -8132,6 +8159,21 @@
     return;
   }
 
+  // For a globally uniform instruction, we only need to compute a single
+  // lane and reuse it across each unrolled iteration. For simplicity's sake,
+  // we preserve the existing per-unrolled-iteration invariant and record the
+  // result in the first lane of every part. We could instead teach all the
+  // consuming logic about global uniformity, but this approach is less error
+  // prone.
+  if (IsGloballyUniform) {
+    assert(IsUniform);
+    State.ILV->scalarizeInstruction(Ingredient, *this, {0, 0},
+                                    IsPredicated, State);
+    for (unsigned Part = 1; Part < State.UF; ++Part)
+      State.ILV->reuseScalarLane(Ingredient, {0, 0}, {Part, 0});
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts, unless the
   // instruction is uniform in which case generate only the first lane for each
   // of the UF parts.
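For intuition, a minimal LLVM IR sketch of what the new lane-reuse path above
changes (value names are illustrative; the uniform_mem_op.ll diff below shows
the real FileCheck output). Consider a loop-invariant load vectorized with
VF=4 and UF=4:

  ; Without lane reuse, each unrolled part re-emits the same scalar load:
  %t4 = load i32, i32* %addr, align 4
  %t5 = load i32, i32* %addr, align 4
  %t6 = load i32, i32* %addr, align 4
  %t7 = load i32, i32* %addr, align 4

  ; With this patch, only lane {0, 0} is materialized; reuseScalarLane then
  ; records the same Value for lanes {1, 0}..{3, 0}, so no IR is emitted for
  ; the remaining parts:
  %t4 = load i32, i32* %addr, align 4

The reuse is purely a VectorLoopValueMap bookkeeping update; later consumers
looking up the scalar value for any part resolve to the single emitted
instruction.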
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1120,6 +1120,13 @@
   /// Indicator if only a single replica per lane is needed.
   bool IsUniform;
 
+  /// Does the expression compute the same value across all lanes of the
+  /// unrolled computation? Note that an expression can be uniform within a
+  /// single vector bundle and yet not uniform across unrollings. This is
+  /// common for e.g. widened loads, where we need to materialize the base
+  /// address for each vector load.
+  bool IsGloballyUniform;
+
   /// Indicator if the replicas are also predicated.
   bool IsPredicated;
 
@@ -1129,9 +1136,10 @@
 public:
   template <typename IterT>
   VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
-                    bool IsUniform, bool IsPredicated = false)
+                    bool IsUniform, bool IsGloballyUniform, bool IsPredicated)
       : VPRecipeBase(VPReplicateSC), VPUser(Operands), Ingredient(I),
-        IsUniform(IsUniform), IsPredicated(IsPredicated) {
+        IsUniform(IsUniform), IsGloballyUniform(IsGloballyUniform),
+        IsPredicated(IsPredicated) {
     // Retain the previous behavior of predicateInstructions(), where an
     // insert-element of a predicated instruction got hoisted into the
     // predicated basic block iff it was its only user. This is achieved by
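To see why the flag is distinct from IsUniform, compare a value that is
uniform only within one vector bundle against one that is uniform across the
whole unrolled iteration, as the IsGloballyUniform comment above describes.
A sketch with VF=4, UF=2 (names are illustrative, not taken from the patch):

  ; Uniform within each part but NOT globally uniform: the base address of a
  ; widened load must be rematerialized per part, since it advances with the
  ; induction variable.
  %gep.0 = getelementptr i32, i32* %base, i64 %index
  %gep.1 = getelementptr i32, i32* %base, i64 %index.part1   ; %index + 4

  ; Globally uniform: a loop-invariant address produces one value that all
  ; parts can share.
  %inv = load i32, i32* %addr, align 4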
Index: llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -21,12 +21,9 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -40,7 +37,7 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
 ;
 entry:
@@ -65,43 +62,34 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT:    [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP7]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP8]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -112,7 +100,7 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -144,20 +132,11 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -173,7 +152,7 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
 ;
 entry:
@@ -410,28 +389,25 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
 ; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
 ; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
 ; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
 ; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP17:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP17:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]