Index: llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -718,9 +718,6 @@ /// /// It has some known flaws. /// -/// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting -/// BlockFrequency's 64-bit integer precision. -/// /// - The model of irreducible control flow is a rough approximation. /// /// Modelling irreducible control flow exactly involves setting up and Index: llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp =================================================================== --- llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -331,32 +331,35 @@ return true; } -/// \brief Get the maximum allowed loop scale. -/// -/// Gives the maximum number of estimated iterations allowed for a loop. Very -/// large numbers cause problems downstream (even within 64-bits). -static Scaled64 getMaxLoopScale() { return Scaled64(1, 12); } - /// \brief Compute the loop scale for a loop. void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { // Compute loop scale. DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); + // Infinite loops need special handling. If we give the back edge an infinite + // mass, they may saturate all the other scales in the function down to 1, + // making all the other region temperatures look exactly the same. Choose an + // arbitrary scale to avoid these issues. + // + // FIXME: An alternate way would be to select a symbolic scale which is later + // replaced to be the maximum of all computed scales plus 1. This would + // appropriately describe the loop as having a large scale, without skewing + // the final frequency computation. + const Scaled64 InifiniteLoopScale(1, 12); + // LoopScale == 1 / ExitMass // ExitMass == HeadMass - BackedgeMass BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass; - // Block scale stores the inverse of the scale. - Loop.Scale = ExitMass.toScaled().inverse(); + // Block scale stores the inverse of the scale. If this is an infinite loop, + // its exit mass will be zero. In this case, use an arbitrary scale for the + // loop scale. + Loop.Scale = + ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse(); DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() << " - " << Loop.BackedgeMass << ")\n" << " - scale = " << Loop.Scale << "\n"); - - if (Loop.Scale > getMaxLoopScale()) { - Loop.Scale = getMaxLoopScale(); - DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n"); - } } /// \brief Package up a loop. @@ -424,15 +427,24 @@ const Scaled64 &Min, const Scaled64 &Max) { // Scale the Factor to a size that creates integers. Ideally, integers would // be scaled so that Max == UINT64_MAX so that they can be best - // differentiated. However, the register allocator currently deals poorly - // with large numbers. Instead, push Min up a little from 1 to give some - // room to differentiate small, unequal numbers. - // - // TODO: fix issues downstream so that ScalingFactor can be - // Scaled64(1,64)/Max. - Scaled64 ScalingFactor = Min.inverse(); - if ((Max / Min).lg() < 60) + // differentiated. However, in the presence of large frequency values, small + // frequencies are scaled down to 1, making it impossible to differentiate + // small, unequal numbers. When the spread between Min and Max frequencies + // fits well within MaxBits, we make the scale be at least 8. + const unsigned MaxBits = 64; + const unsigned SpreadBits = (Max / Min).lg(); + Scaled64 ScalingFactor; + if (SpreadBits <= MaxBits - 3) { + // If the values are small enough, make the scaling factor at least 8 to + // allow distinguishing small values. + ScalingFactor = Min.inverse(); ScalingFactor <<= 3; + } else { + // If the values need more than MaxBits to be represented, saturate small + // frequency values down to 1 by using a scaling factor that benefits large + // frequency values. + ScalingFactor = Scaled64(1, MaxBits) / Max; + } // Translate the floats to integers. DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max Index: llvm/trunk/test/Analysis/BlockFrequencyInfo/bad_input.ll =================================================================== --- llvm/trunk/test/Analysis/BlockFrequencyInfo/bad_input.ll +++ llvm/trunk/test/Analysis/BlockFrequencyInfo/bad_input.ll @@ -32,7 +32,8 @@ entry: br i1 %x, label %for.body, label %for.end, !prof !1 -; Check that the loop scale maxes out at 4096, giving 2048 here. +; Check that the infinite loop is arbitrarily scaled to max out at 4096, +; giving 2048 here. ; CHECK-NEXT: for.body: float = 2048.0, for.body: %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] Index: llvm/trunk/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll =================================================================== --- llvm/trunk/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll +++ llvm/trunk/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll @@ -0,0 +1,204 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; This code contains three loops. One is triple-nested, the +; second is double nested and the third is a single loop. At +; runtime, all three loops execute 1,000,000 times each. We use to +; give different frequencies to each of the loops because loop +; scales were limited to no more than 4,096. +; +; This was penalizing the hotness of the second and third loops +; because BFI was reducing the loop scale for for.cond16 and +; for.cond26 to a max of 4,096. +; +; Without this restriction, all loops are now correctly given the same +; frequency values. +; +; Original C code: +; +; +; int g; +; __attribute__((noinline)) void bar() { +; g++; +; } +; +; extern int printf(const char*, ...); +; +; int main() +; { +; int i, j, k; +; +; g = 0; +; for (i = 0; i < 100; i++) +; for (j = 0; j < 100; j++) +; for (k = 0; k < 100; k++) +; bar(); +; +; printf ("g = %d\n", g); +; g = 0; +; +; for (i = 0; i < 100; i++) +; for (j = 0; j < 10000; j++) +; bar(); +; +; printf ("g = %d\n", g); +; g = 0; +; +; +; for (i = 0; i < 1000000; i++) +; bar(); +; +; printf ("g = %d\n", g); +; g = 0; +; } + +@g = common global i32 0, align 4 +@.str = private unnamed_addr constant [8 x i8] c"g = %d\0A\00", align 1 + +declare void @bar() +declare i32 @printf(i8*, ...) + +; CHECK: Printing analysis {{.*}} for function 'main': +; CHECK-NEXT: block-frequency-info: main +define i32 @main() { +entry: + %retval = alloca i32, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* @g, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc10, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end12, !prof !1 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc7, %for.body + %1 = load i32, i32* %j, align 4 + %cmp2 = icmp slt i32 %1, 100 + br i1 %cmp2, label %for.body3, label %for.end9, !prof !2 + +for.body3: ; preds = %for.cond1 + store i32 0, i32* %k, align 4 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %2 = load i32, i32* %k, align 4 + %cmp5 = icmp slt i32 %2, 100 + br i1 %cmp5, label %for.body6, label %for.end, !prof !3 + +; CHECK: - for.body6: float = 500000.5, int = 4000003 +for.body6: ; preds = %for.cond4 + call void @bar() + br label %for.inc + +for.inc: ; preds = %for.body6 + %3 = load i32, i32* %k, align 4 + %inc = add nsw i32 %3, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc7 + +for.inc7: ; preds = %for.end + %4 = load i32, i32* %j, align 4 + %inc8 = add nsw i32 %4, 1 + store i32 %inc8, i32* %j, align 4 + br label %for.cond1 + +for.end9: ; preds = %for.cond1 + br label %for.inc10 + +for.inc10: ; preds = %for.end9 + %5 = load i32, i32* %i, align 4 + %inc11 = add nsw i32 %5, 1 + store i32 %inc11, i32* %i, align 4 + br label %for.cond + +for.end12: ; preds = %for.cond + %6 = load i32, i32* @g, align 4 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %6) + store i32 0, i32* @g, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond13 + +for.cond13: ; preds = %for.inc22, %for.end12 + %7 = load i32, i32* %i, align 4 + %cmp14 = icmp slt i32 %7, 100 + br i1 %cmp14, label %for.body15, label %for.end24, !prof !1 + +for.body15: ; preds = %for.cond13 + store i32 0, i32* %j, align 4 + br label %for.cond16 + +for.cond16: ; preds = %for.inc19, %for.body15 + %8 = load i32, i32* %j, align 4 + %cmp17 = icmp slt i32 %8, 10000 + br i1 %cmp17, label %for.body18, label %for.end21, !prof !4 + +; CHECK: - for.body18: float = 500000.5, int = 4000003 +for.body18: ; preds = %for.cond16 + call void @bar() + br label %for.inc19 + +for.inc19: ; preds = %for.body18 + %9 = load i32, i32* %j, align 4 + %inc20 = add nsw i32 %9, 1 + store i32 %inc20, i32* %j, align 4 + br label %for.cond16 + +for.end21: ; preds = %for.cond16 + br label %for.inc22 + +for.inc22: ; preds = %for.end21 + %10 = load i32, i32* %i, align 4 + %inc23 = add nsw i32 %10, 1 + store i32 %inc23, i32* %i, align 4 + br label %for.cond13 + +for.end24: ; preds = %for.cond13 + %11 = load i32, i32* @g, align 4 + %call25 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %11) + store i32 0, i32* @g, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond26 + +for.cond26: ; preds = %for.inc29, %for.end24 + %12 = load i32, i32* %i, align 4 + %cmp27 = icmp slt i32 %12, 1000000 + br i1 %cmp27, label %for.body28, label %for.end31, !prof !5 + +; CHECK: - for.body28: float = 500000.5, int = 4000003 +for.body28: ; preds = %for.cond26 + call void @bar() + br label %for.inc29 + +for.inc29: ; preds = %for.body28 + %13 = load i32, i32* %i, align 4 + %inc30 = add nsw i32 %13, 1 + store i32 %inc30, i32* %i, align 4 + br label %for.cond26 + +for.end31: ; preds = %for.cond26 + %14 = load i32, i32* @g, align 4 + %call32 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %14) + store i32 0, i32* @g, align 4 + %15 = load i32, i32* %retval + ret i32 %15 +} + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"} +!1 = !{!"branch_weights", i32 101, i32 2} +!2 = !{!"branch_weights", i32 10001, i32 101} +!3 = !{!"branch_weights", i32 1000001, i32 10001} +!4 = !{!"branch_weights", i32 1000001, i32 101} +!5 = !{!"branch_weights", i32 1000001, i32 2}