Index: lib/Transforms/Utils/LoopUnrollPeel.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollPeel.cpp +++ lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -367,8 +367,9 @@ // side of the branch drops linearly with the iteration number, and we use // a 0.9 fudge factor to make the drop-off less sharp... if (PeeledHeaderWeight) { - uint64_t FallThruWeight = - PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9); + // Use ceil to get preference for in loop edge. + uint64_t FallThruWeight = ceil( + PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9)); uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight; PeeledHeaderWeight -= ExitWeight; @@ -399,6 +400,34 @@ // The # of times the loop body executes is the sum of the exit block // weight and the # of times the backedges are taken. CurHeaderWeight = TrueWeight + FalseWeight; + + // if ExitWeight is too small (for example due to normalization of weights + // then we will not be able to distribute this small weight between all + // peeled copies of branch. To get the distribution we'd like to see we can + // scale the backedge and exit weight to get enough value for ExitWeight. + // Taking into account that we know the multipliers used in + // updateBranchWeights in advance we can pre-compute what value of ExitWeight + // will be enough to have at least 1 for ExitWeight at the end. + // If AvgIters is greater then 10 we can have a problem with precision of + // float then just avoid any modification at all. + if (AvgIters > 10) + return; + + float C = 1.0; + for (unsigned i = 0; i < AvgIters; i++) + C *= ((float)(AvgIters - i) / AvgIters * 0.9); + uint64_t MinExit = ceil(1 / C); + // If ExitWeight is enough big, no need to scale. + if (ExitWeight >= MinExit) + return; + + uint64_t Multiplier = ceil((float)MinExit / ExitWeight); + // Handle overflow of CurHeaderWeight. + if (CurHeaderWeight * Multiplier >= 1u << 31) + return; + + ExitWeight *= Multiplier; + CurHeaderWeight *= Multiplier; } /// Update the weights of original Latch block after peeling off all iterations. Index: test/Transforms/LoopUnroll/peel-loop-pgo-small-weigth.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/peel-loop-pgo-small-weigth.ll @@ -0,0 +1,56 @@ +; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll 2>&1 | FileCheck %s +; RUN: opt < %s -S -debug-only=loop-unroll -passes='require,function(require,unroll)' 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Make sure we use the profile information correctly to peel-off 3 iterations +; from the loop, and update the branch weights for the peeled loop properly. + +; CHECK: Loop Unroll: F[basic] +; CHECK: PEELING loop %for.body with iteration count 4! + +; CHECK-LABEL: @basic +; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !1 +; CHECK: [[NEXT0]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !2 +; CHECK: [[NEXT1]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !3 +; CHECK: [[NEXT2]]: +; CHECK: br i1 %{{.*}}, label %[[NEXT3:.*]], label %for.cond.for.end_crit_edge, !prof !4 +; CHECK: [[NEXT3]]: +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !5 + +define void @basic(i32* %p, i32 %k) #0 !prof !0 { +entry: + %cmp3 = icmp slt i32 0, %k + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %p.addr.04 = phi i32* [ %p, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %p.addr.04, i32 1 + store i32 %i.05, i32* %p.addr.04, align 4 + %inc = add nsw i32 %i.05, 1 + %cmp = icmp slt i32 %inc, %k + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !1 + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} + +attributes #0 = { nounwind } + +!0 = !{!"function_entry_count", i64 1} +!1 = !{!"branch_weights", i32 4, i32 1} + +;CHECK: !1 = !{!"branch_weights", i32 16, i32 1} +;CHECK: !2 = !{!"branch_weights", i32 11, i32 5} +;CHECK: !3 = !{!"branch_weights", i32 5, i32 6} +;CHECK: !4 = !{!"branch_weights", i32 2, i32 3} +;CHECK: !5 = !{!"branch_weights", i32 34, i32 2} + Index: test/Transforms/LoopUnroll/peel-loop-pgo.ll =================================================================== --- test/Transforms/LoopUnroll/peel-loop-pgo.ll +++ test/Transforms/LoopUnroll/peel-loop-pgo.ll @@ -103,8 +103,8 @@ !15 = !{!"function_entry_count", i64 1} !16 = !{!"branch_weights", i32 3001, i32 1001} -;CHECK: !15 = !{!"branch_weights", i32 900, i32 101} -;CHECK: !16 = !{!"branch_weights", i32 540, i32 360} -;CHECK: !17 = !{!"branch_weights", i32 162, i32 378} -;CHECK: !18 = !{!"branch_weights", i32 1399, i32 162} +;CHECK: !15 = !{!"branch_weights", i32 901, i32 100} +;CHECK: !16 = !{!"branch_weights", i32 541, i32 360} +;CHECK: !17 = !{!"branch_weights", i32 163, i32 378} +;CHECK: !18 = !{!"branch_weights", i32 1396, i32 163}