Index: llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -35,6 +35,11 @@
     return true;
   }
 
+  // If we have a loop-invariant computation, we only need to compute it once.
+  // Given that, all but the first occurrence are free.
+  if (!IterationNumber->isZero() && SE.isLoopInvariant(S, L))
+    return true;
+
   auto *AR = dyn_cast<SCEVAddRecExpr>(S);
   if (!AR || AR->getLoop() != L)
     return false;
Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -725,13 +725,68 @@
   return MaxPercentThresholdBoost;
 }
 
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
-    unsigned LoopSize,
-    TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
-  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
+/// Helper class to estimate the size of a loop with a given unroll factor.
+/// Note that there's a more expensive estimator used for full unrolling if the
+/// result of this one doesn't show profit, but this is the only estimate used
+/// for partial and runtime unrolling.
+///
+/// TODO: There are a number of ways to make this more sophisticated.
+/// 1) Extend the uniform logic to understand expressions which are uniform for
+///    a given number of iterations (but not all iterations). An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i/4])
+///          b[i];
+///      }
+/// 2) Recognize patterns which repeat predictably with unrolling. An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i % 4])
+///          b[i];
+///      }
+/// 3) Integrate PeelCount. If we know we're peeling once, the globally
+///    uniform expressions don't contribute to the loop size of the remaining
+///    (potentially unrollable) loop.
+/// 4) Use aliasing information to reason about CSEable loads. Consider the
+///    load from 'a' in example #1 with an unroll factor of 4.
+/// 5) Before this gets too complicated, we should consider merging this with
+///    the static unroll cost model.
+class LoopSizeEstimator {
+  Loop &TheLoop;
+  ScalarEvolution &SE;
+  unsigned LoopSize;
+  unsigned UniformSize;
+
+public:
+  LoopSizeEstimator(Loop &L, ScalarEvolution &SE, unsigned LoopSize)
+      : TheLoop(L), SE(SE), LoopSize(LoopSize), UniformSize(-1U) {}
+
+  uint64_t getEstimatedLoopSize(TargetTransformInfo::UnrollingPreferences &UP) {
+    if (-1U == UniformSize)
+      UniformSize = computeUniformInstCount();
+    unsigned LocalUniformSize = UniformSize;
+    // Avoid the degenerate case where the entire loop body is uniform across
+    // all iterations. We should break the backedge, not unroll that.
+    if (LocalUniformSize + UP.BEInsns + 1 > LoopSize)
+      LocalUniformSize = 0;
+
+    assert(LoopSize >= (UP.BEInsns + LocalUniformSize) &&
+           "LoopSize should not be less than BEInsns plus UniformSize!");
+    return (uint64_t)(LoopSize - UP.BEInsns - LocalUniformSize) * UP.Count +
+           UP.BEInsns + LocalUniformSize;
+  }
+
+private:
+  unsigned computeUniformInstCount() {
+    unsigned Count = 0;
+    for (BasicBlock *BB : TheLoop.blocks())
+      for (Instruction &I : *BB) {
+        if (!SE.isSCEVable(I.getType()))
+          continue;
+        if (SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop))
+          Count++;
+      }
+    return Count;
+  }
+};
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
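For readers skimming the diff, the new cost model reduces to the arithmetic
below. This is an illustrative, standalone sketch, not part of the patch: the
free-standing estimateUnrolledSize name is invented for the example, and the
real logic lives in LoopSizeEstimator::getEstimatedLoopSize above.

    #include <cassert>
    #include <cstdint>

    // Uniform (loop-invariant) instructions and the backedge bookkeeping are
    // paid once; every other instruction is replicated Count times.
    uint64_t estimateUnrolledSize(unsigned LoopSize, unsigned BEInsns,
                                  unsigned UniformSize, unsigned Count) {
      // Degenerate case: if (nearly) the whole body is uniform, breaking the
      // backedge is the right transform, so take no discount at all.
      if (UniformSize + BEInsns + 1 > LoopSize)
        UniformSize = 0;
      assert(LoopSize >= BEInsns + UniformSize && "size underflow");
      return (uint64_t)(LoopSize - BEInsns - UniformSize) * Count + BEInsns +
             UniformSize;
    }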
@@ -749,6 +804,8 @@
     TargetTransformInfo::UnrollingPreferences &UP,
     TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
 
+  LoopSizeEstimator LSE(*L, SE, LoopSize);
+
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -756,7 +813,7 @@
     UP.Count = UnrollCount;
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
-    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+    if (UP.AllowRemainder && LSE.getEstimatedLoopSize(UP) < UP.Threshold)
       return true;
   }
 
@@ -768,13 +825,13 @@
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
     if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
-        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+        LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return true;
   }
 
   bool PragmaFullUnroll = hasUnrollFullPragma(L);
   if (PragmaFullUnroll && TripCount != 0) {
     UP.Count = TripCount;
-    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+    if (LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return false;
   }
@@ -823,7 +880,7 @@
   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
-    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+    if (LSE.getEstimatedLoopSize(UP) < UP.Threshold) {
       UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
       TripCount = FullUnrollTripCount;
       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
@@ -870,7 +927,7 @@
     UP.Count = TripCount;
     if (UP.PartialThreshold != NoThreshold) {
       // Reduce unroll count to be modulo of TripCount for partial unrolling.
-      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+      if (LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count =
             (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
             (LoopSize - UP.BEInsns);
@@ -885,7 +942,7 @@
       // remainder loop is allowed.
       UP.Count = UP.DefaultUnrollRuntimeCount;
       while (UP.Count != 0 &&
-             getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+             LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count >>= 1;
     }
     if (UP.Count < 2) {
@@ -969,7 +1026,7 @@
   // Reduce unroll count to be the largest power-of-two factor of
   // the original count which satisfies the threshold limit.
   while (UP.Count != 0 &&
-         getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+         LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
     UP.Count >>= 1;
 
 #ifndef NDEBUG
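To make the effect on the threshold checks above concrete, consider the
partial-uniform test added below. Assuming its body measures LoopSize = 12
with UP.BEInsns = 2 (numbers inferred from the test's "cost 36 not 42"
comment, not stated in the patch), the old formula prices an unroll-by-4 at
(12 - 2) * 4 + 2 = 42, which exceeds -unroll-partial-threshold=36 and forces
the count to be halved. Discounting the two uniform instructions gives
(12 - 2 - 2) * 4 + 2 + 2 = 36, which fits under the threshold, so the
unroll-by-4 survives.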
Index: llvm/test/Transforms/LoopUnroll/X86/full-uniform.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopUnroll/X86/full-uniform.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p
+; RUN: opt < %s -S -unroll-partial-threshold=0 -unroll-threshold=80 -loop-unroll -unroll-allow-partial=0 -unroll-runtime=0 -unroll-allow-remainder=0 -unroll-max-percent-threshold-boost=0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Two instructions are loop invariant and thus don't count against
+;; the unroll cost for iterations > 1. As a result, we can unroll this.
+define i32 @uniform_contents(i32* %A, i32* %B, i32 %c, i32 %d) {
+; CHECK-LABEL: @uniform_contents(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label %loop
+; CHECK:       loop:
+; CHECK-NEXT:    %v1 = load i32, i32* %A, align 4
+; CHECK-NEXT:    %earlyexit = icmp eq i32 %v1, 0
+; CHECK-NEXT:    br i1 %earlyexit, label %loop_exit, label %latch
+; CHECK:       latch:
+; CHECK-NEXT:    %index = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2 = getelementptr i32, i32* %A, i32 %index
+; CHECK-NEXT:    %v2 = load i32, i32* %gep2, align 4
+; CHECK-NEXT:    %gep1.1 = getelementptr i32, i32* %A, i64 1
+; CHECK-NEXT:    %v1.1 = load i32, i32* %gep1.1, align 4
+; CHECK-NEXT:    %earlyexit.1 = icmp eq i32 %v1.1, 0
+; CHECK-NEXT:    br i1 %earlyexit.1, label %loop_exit, label %latch.1
+; CHECK:       loop_exit:
+; CHECK-NEXT:    %accum.lcssa = phi i32 [ 0, %loop ], [ %v2, %latch ], [ %accum.next.1, %latch.1 ], [ %accum.next.2, %latch.2 ], [ %accum.next.3, %latch.3 ], [ %accum.next.4, %latch.4 ], [ %accum.next.5, %latch.5 ], [ %accum.next.6, %latch.6 ], [ %accum.next.7, %latch.7 ], [ %accum.next.7, %latch.8 ]
+; CHECK-NEXT:    ret i32 %accum.lcssa
+; CHECK:       latch.1:
+; CHECK-NEXT:    %index.1 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.1 = getelementptr i32, i32* %A, i32 %index.1
+; CHECK-NEXT:    %v2.1 = load i32, i32* %gep2.1, align 4
+; CHECK-NEXT:    %accum.next.1 = add i32 %v2, %v2.1
+; CHECK-NEXT:    %gep1.2 = getelementptr i32, i32* %A, i64 2
+; CHECK-NEXT:    %v1.2 = load i32, i32* %gep1.2, align 4
+; CHECK-NEXT:    %earlyexit.2 = icmp eq i32 %v1.2, 0
+; CHECK-NEXT:    br i1 %earlyexit.2, label %loop_exit, label %latch.2
+; CHECK:       latch.2:
+; CHECK-NEXT:    %index.2 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.2 = getelementptr i32, i32* %A, i32 %index.2
+; CHECK-NEXT:    %v2.2 = load i32, i32* %gep2.2, align 4
+; CHECK-NEXT:    %accum.next.2 = add i32 %accum.next.1, %v2.2
+; CHECK-NEXT:    %gep1.3 = getelementptr i32, i32* %A, i64 3
+; CHECK-NEXT:    %v1.3 = load i32, i32* %gep1.3, align 4
+; CHECK-NEXT:    %earlyexit.3 = icmp eq i32 %v1.3, 0
+; CHECK-NEXT:    br i1 %earlyexit.3, label %loop_exit, label %latch.3
+; CHECK:       latch.3:
+; CHECK-NEXT:    %index.3 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.3 = getelementptr i32, i32* %A, i32 %index.3
+; CHECK-NEXT:    %v2.3 = load i32, i32* %gep2.3, align 4
+; CHECK-NEXT:    %accum.next.3 = add i32 %accum.next.2, %v2.3
+; CHECK-NEXT:    %gep1.4 = getelementptr i32, i32* %A, i64 4
+; CHECK-NEXT:    %v1.4 = load i32, i32* %gep1.4, align 4
+; CHECK-NEXT:    %earlyexit.4 = icmp eq i32 %v1.4, 0
+; CHECK-NEXT:    br i1 %earlyexit.4, label %loop_exit, label %latch.4
+; CHECK:       latch.4:
+; CHECK-NEXT:    %index.4 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.4 = getelementptr i32, i32* %A, i32 %index.4
+; CHECK-NEXT:    %v2.4 = load i32, i32* %gep2.4, align 4
+; CHECK-NEXT:    %accum.next.4 = add i32 %accum.next.3, %v2.4
+; CHECK-NEXT:    %gep1.5 = getelementptr i32, i32* %A, i64 5
+; CHECK-NEXT:    %v1.5 = load i32, i32* %gep1.5, align 4
+; CHECK-NEXT:    %earlyexit.5 = icmp eq i32 %v1.5, 0
+; CHECK-NEXT:    br i1 %earlyexit.5, label %loop_exit, label %latch.5
+; CHECK:       latch.5:
+; CHECK-NEXT:    %index.5 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.5 = getelementptr i32, i32* %A, i32 %index.5
+; CHECK-NEXT:    %v2.5 = load i32, i32* %gep2.5, align 4
+; CHECK-NEXT:    %accum.next.5 = add i32 %accum.next.4, %v2.5
+; CHECK-NEXT:    %gep1.6 = getelementptr i32, i32* %A, i64 6
+; CHECK-NEXT:    %v1.6 = load i32, i32* %gep1.6, align 4
+; CHECK-NEXT:    %earlyexit.6 = icmp eq i32 %v1.6, 0
+; CHECK-NEXT:    br i1 %earlyexit.6, label %loop_exit, label %latch.6
+; CHECK:       latch.6:
+; CHECK-NEXT:    %index.6 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.6 = getelementptr i32, i32* %A, i32 %index.6
+; CHECK-NEXT:    %v2.6 = load i32, i32* %gep2.6, align 4
+; CHECK-NEXT:    %accum.next.6 = add i32 %accum.next.5, %v2.6
+; CHECK-NEXT:    %gep1.7 = getelementptr i32, i32* %A, i64 7
+; CHECK-NEXT:    %v1.7 = load i32, i32* %gep1.7, align 4
+; CHECK-NEXT:    %earlyexit.7 = icmp eq i32 %v1.7, 0
+; CHECK-NEXT:    br i1 %earlyexit.7, label %loop_exit, label %latch.7
+; CHECK:       latch.7:
+; CHECK-NEXT:    %index.7 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.7 = getelementptr i32, i32* %A, i32 %index.7
+; CHECK-NEXT:    %v2.7 = load i32, i32* %gep2.7, align 4
+; CHECK-NEXT:    %accum.next.7 = add i32 %accum.next.6, %v2.7
+; CHECK-NEXT:    %gep1.8 = getelementptr i32, i32* %A, i64 8
+; CHECK-NEXT:    %v1.8 = load i32, i32* %gep1.8, align 4
+; CHECK-NEXT:    %earlyexit.8 = icmp eq i32 %v1.8, 0
+; CHECK-NEXT:    br i1 %earlyexit.8, label %loop_exit, label %latch.8
+; CHECK:       latch.8:
+; CHECK-NEXT:    %index.8 = udiv i32 %c, %d
+; CHECK-NEXT:    %gep2.8 = getelementptr i32, i32* %A, i32 %index.8
+; CHECK-NEXT:    %v2.8 = load i32, i32* %gep2.8, align 4
+; CHECK-NEXT:    br label %loop_exit
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
+  %gep1 = getelementptr i32, i32* %A, i64 %iv
+  %v1 = load i32, i32* %gep1
+  %earlyexit = icmp eq i32 %v1, 0
+  br i1 %earlyexit, label %loop_exit, label %latch
+latch:
+  %index = udiv i32 %c, %d
+  %gep2 = getelementptr i32, i32* %A, i32 %index
+  %v2 = load i32, i32* %gep2
+  %iv.next = add i64 %iv, 1
+  %accum.next = add i32 %accum, %v2
+  %exit = icmp eq i64 %iv, 8
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i32 %accum
+}
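In both tests, the two instructions SCEV proves loop invariant are
%index = udiv i32 %c, %d and the %gep2 address computed from it; after
unrolling, every copy beyond the first (%index.1, %gep2.1, ...) computes the
same value and is trivially CSE-able, which is why the estimator charges them
only once. The loads through %gep2 are deliberately not discounted; reasoning
about CSEable loads is TODO item 4 in the class comment.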
Index: llvm/test/Transforms/LoopUnroll/X86/partial-uniform.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopUnroll/X86/partial-uniform.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -unroll-partial-threshold=36 -unroll-threshold=0 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Two instructions are loop invariant and thus don't count against
+;; the unroll cost for iterations > 1. As a result, we can unroll this
+;; at a cost of 36 rather than 42.
+define i32 @uniform_contents(i32* %A, i32* %B, i32 %c, i32 %d) {
+; CHECK-LABEL: @uniform_contents(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_3:%.*]], [[LATCH_3:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT_3:%.*]], [[LATCH_3]] ]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1]], align 4
+; CHECK-NEXT:    [[EARLYEXIT:%.*]] = icmp eq i32 [[V1]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT]], label [[LOOP_EXIT:%.*]], label [[LATCH:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[INDEX:%.*]] = udiv i32 [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT:%.*]] = add i32 [[ACCUM]], [[V2]]
+; CHECK-NEXT:    [[GEP1_1:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[V1_1:%.*]] = load i32, i32* [[GEP1_1]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_1:%.*]] = icmp eq i32 [[V1_1]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_1]], label [[LOOP_EXIT]], label [[LATCH_1:%.*]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_LCSSA:%.*]] = phi i32 [ [[ACCUM]], [[LOOP]] ], [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[ACCUM_NEXT_1:%.*]], [[LATCH_1]] ], [ [[ACCUM_NEXT_1]], [[LATCH_2:%.*]] ], [ [[ACCUM_NEXT_2:%.*]], [[LOOP_3:%.*]] ]
+; CHECK-NEXT:    ret i32 [[ACCUM_LCSSA]]
+; CHECK:       latch.1:
+; CHECK-NEXT:    [[INDEX_1:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_1]]
+; CHECK-NEXT:    [[V2_1:%.*]] = load i32, i32* [[GEP2_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_NEXT]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_1]] = add i32 [[ACCUM_NEXT]], [[V2_1]]
+; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[V1_2:%.*]] = load i32, i32* [[GEP1_2]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_2:%.*]] = icmp eq i32 [[V1_2]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_2]], label [[LOOP_EXIT]], label [[LATCH_2]]
+; CHECK:       latch.2:
+; CHECK-NEXT:    [[INDEX_2:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_2]]
+; CHECK-NEXT:    [[V2_2:%.*]] = load i32, i32* [[GEP2_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_NEXT_1]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_2]] = add i32 [[ACCUM_NEXT_1]], [[V2_2]]
+; CHECK-NEXT:    [[EXIT_2:%.*]] = icmp eq i64 [[IV_NEXT_1]], 198
+; CHECK-NEXT:    br i1 [[EXIT_2]], label [[LOOP_EXIT]], label [[LOOP_3]]
+; CHECK:       loop.3:
+; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[V1_3:%.*]] = load i32, i32* [[GEP1_3]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_3:%.*]] = icmp eq i32 [[V1_3]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_3]], label [[LOOP_EXIT]], label [[LATCH_3]]
+; CHECK:       latch.3:
+; CHECK-NEXT:    [[INDEX_3:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_3]]
+; CHECK-NEXT:    [[V2_3:%.*]] = load i32, i32* [[GEP2_3]], align 4
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV_NEXT_2]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_3]] = add i32 [[ACCUM_NEXT_2]], [[V2_3]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
+  %gep1 = getelementptr i32, i32* %A, i64 %iv
+  %v1 = load i32, i32* %gep1
+  %earlyexit = icmp eq i32 %v1, 0
+  br i1 %earlyexit, label %loop_exit, label %latch
+latch:
+  %index = udiv i32 %c, %d
+  %gep2 = getelementptr i32, i32* %A, i32 %index
+  %v2 = load i32, i32* %gep2
+  %iv.next = add i64 %iv, 1
+  %accum.next = add i32 %accum, %v2
+  %exit = icmp eq i64 %iv, 198
+  br i1 %exit, label %loop_exit, label %loop

+loop_exit:
+  ret i32 %accum
+}
Index: llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
===================================================================
--- llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -116,7 +116,7 @@
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]]
 ; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
-; CHECK-NEXT:    [[INC_3]] = add nsw i64 [[INC_2]], 1
+; CHECK-NEXT:    [[INC_3]] = add nuw nsw i64 [[INC_2]], 1
 ; CHECK-NEXT:    br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]]
 ; CHECK:       for.body.3:
 ; CHECK-NEXT:    [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]])
@@ -124,7 +124,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !0
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP0:!llvm.loop !.*]]
 ;
 entry:
   br i1 true, label %for.preheader, label %for.end
@@ -202,7 +202,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !2
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   %0 = load i32, i32* %A, align 4