diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -25,6 +25,8 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -50,6 +52,9 @@ cl::desc("Allow loop rotation multiple times in order to reach " "a better latch exit")); +// Probability that a rotated loop has zero trip count / is never entered. +static constexpr uint32_t ZeroTripCountWeights[] = {1, 127}; + namespace { /// A simple loop rotation transformation. class LoopRotate { @@ -244,6 +249,93 @@ return false; } +static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, + bool HasConditionalPreHeader, + bool SuccsSwapped) { + MDNode *WeightMD = getBranchWeightMDNode(PreHeaderBI); + if (WeightMD == nullptr) + return; + + // LoopBI should currently be a clone of PreHeaderBI with the same + // metadata. But we double check to make sure we don't have a degenerate case + // where instsimplify changed the instructions. + if (WeightMD != getBranchWeightMDNode(LoopBI)) + return; + + SmallVector Weights; + extractFromBranchWeightMD(WeightMD, Weights); + if (Weights.size() != 2) + return; + uint32_t OrigLoopExitWeight = Weights[0]; + uint32_t OrigLoopBackedgeWeight = Weights[1]; + + if (SuccsSwapped) + std::swap(OrigLoopExitWeight, OrigLoopBackedgeWeight); + + // Update branch weights. Consider the following edge-counts: + // + // | |-------- | + // V V | V + // Br i1 ... | Br i1 ... + // | | | | | + // x| y| | becomes: | y0| |----- + // V V | | V V | + // Exit Loop | | Loop | + // | | | Br i1 ... | + // ----- | | | | + // x0| x1| y1 | | + // V V ---- + // Exit + // + // The following must hold: + // - x == x0 + x1 # counts to "exit" must stay the same. + // - y0 == x - x0 == x1 # how often loop was entered at all. + // - y1 == y - y0 # How often loop was repeated (after first iter.). + // + // We cannot generally deduce how often we had a zero-trip count loop so we + // have to make a guess for how to distribute x among the new x0 and x1. + + uint32_t ExitWeight0 = 0; // aka x0 + if (HasConditionalPreHeader) { + // Here we cannot know how many 0-trip count loops we have, so we guess: + if (OrigLoopBackedgeWeight > OrigLoopExitWeight) { + // If the loop count is bigger than the exit count then we set + // probabilities as if 0-trip count nearly never happens. + ExitWeight0 = ZeroTripCountWeights[0]; + // Scale up counts if necessary so we can match `ZeroTripCountWeights` for + // the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1` ratio`) ratio. + while (OrigLoopExitWeight < ZeroTripCountWeights[1] + ExitWeight0) { + // ... but don't overflow. + uint32_t const HighBit = uint32_t{1} << (sizeof(uint32_t) * 8 - 1); + if ((OrigLoopBackedgeWeight & HighBit) != 0 || + (OrigLoopExitWeight & HighBit) != 0) + break; + OrigLoopBackedgeWeight <<= 1; + OrigLoopExitWeight <<= 1; + } + } else { + // If there's a higher exit-count than backedge-count then we set + // probabilities as if there are only 0-trip and 1-trip cases. + ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight; + } + } + uint32_t ExitWeight1 = OrigLoopExitWeight - ExitWeight0; // aka x1 + uint32_t EnterWeight = ExitWeight1; // aka y0 + uint32_t LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight; // aka y1 + + MDBuilder MDB(LoopBI.getContext()); + MDNode *LoopWeightMD = + MDB.createBranchWeights(SuccsSwapped ? LoopBackWeight : ExitWeight1, + SuccsSwapped ? ExitWeight1 : LoopBackWeight); + LoopBI.setMetadata(LLVMContext::MD_prof, LoopWeightMD); + if (HasConditionalPreHeader) { + MDNode *PreHeaderWeightMD = + MDB.createBranchWeights(SuccsSwapped ? EnterWeight : ExitWeight0, + SuccsSwapped ? ExitWeight0 : EnterWeight); + PreHeaderBI.setMetadata(LLVMContext::MD_prof, PreHeaderWeightMD); + } +} + /// Rotate loop LP. Return true if the loop is rotated. /// /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -363,7 +455,8 @@ // loop. Otherwise loop is not suitable for rotation. BasicBlock *Exit = BI->getSuccessor(0); BasicBlock *NewHeader = BI->getSuccessor(1); - if (L->contains(Exit)) + bool BISuccsSwapped = L->contains(Exit); + if (BISuccsSwapped) std::swap(Exit, NewHeader); assert(NewHeader && "Unable to determine new loop header"); assert(L->contains(NewHeader) && !L->contains(Exit) && @@ -605,9 +698,14 @@ // to split as many edges. BranchInst *PHBI = cast(OrigPreheader->getTerminator()); assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - if (!isa(PHBI->getCondition()) || - PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != - NewHeader) { + const Value *Cond = PHBI->getCondition(); + const bool HasConditionalPreHeader = + !isa(Cond) || + PHBI->getSuccessor(cast(Cond)->isZero()) != NewHeader; + + updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); + + if (HasConditionalPreHeader) { // The conditional branch can't be folded, handle the general case. // Split edges as necessary to preserve LoopSimplify form. diff --git a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll @@ -0,0 +1,156 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s --check-prefixes=BFI_BEFORE +; RUN: opt < %s -passes='loop(loop-rotate),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=BFI_AFTER +; RUN: opt < %s -passes='loop(loop-rotate)' -S | FileCheck %s --check-prefixes=IR + +@g = global i32 0 + +; We should get the same "count =" results for "outer_loop_body" and +; "inner_loop_body" before and after the transformation. + +; BFI_BEFORE-LABEL: block-frequency-info: func0 +; BFI_BEFORE: - entry: {{.*}} count = 1 +; BFI_BEFORE: - outer_loop_header: {{.*}} count = 1001 +; BFI_BEFORE: - outer_loop_body: {{.*}} count = 1000 +; BFI_BEFORE: - inner_loop_header: {{.*}} count = 4000 +; BFI_BEFORE: - inner_loop_body: {{.*}} count = 3000 +; BFI_BEFORE: - inner_loop_exit: {{.*}} count = 1000 +; BFI_BEFORE: - outer_loop_exit: {{.*}} count = 1 + +; BFI_AFTER-LABEL: block-frequency-info: func0 +; BFI_AFTER: - entry: {{.*}} count = 1 +; BFI_AFTER: - outer_loop_body: {{.*}} count = 1000 +; BFI_AFTER: - inner_loop_body: {{.*}} count = 3000 +; BFI_AFTER: - inner_loop_exit: {{.*}} count = 1000 +; BFI_AFTER: - outer_loop_exit: {{.*}} count = 1 + +; IR: inner_loop_body: +; IR: br i1 %cmp1, label %inner_loop_body, label %inner_loop_exit, !prof [[PROF_FUNC0_0:![0-9]+]] +; IR: inner_loop_exit: +; IR: br i1 %cmp0, label %outer_loop_body, label %outer_loop_exit, !prof [[PROF_FUNC0_1:![0-9]+]] +; +; A function with known loop-bounds where after loop-rotation we end with an +; unconditional branch in the pre-header. +define void @func0() !prof !0 { +entry: + br label %outer_loop_header + +outer_loop_header: + %i0 = phi i32 [0, %entry], [%i0_inc, %inner_loop_exit] + %cmp0 = icmp slt i32 %i0, 1000 + br i1 %cmp0, label %outer_loop_body, label %outer_loop_exit, !prof !1 + +outer_loop_body: + store volatile i32 %i0, ptr @g, align 4 + br label %inner_loop_header + +inner_loop_header: + %i1 = phi i32 [0, %outer_loop_body], [%i1_inc, %inner_loop_body] + %cmp1 = icmp slt i32 %i1, 3 + br i1 %cmp1, label %inner_loop_body, label %inner_loop_exit, !prof !2 + +inner_loop_body: + store volatile i32 %i1, ptr @g, align 4 + %i1_inc = add i32 %i1, 1 + br label %inner_loop_header + +inner_loop_exit: + %i0_inc = add i32 %i0, 1 + br label %outer_loop_header + +outer_loop_exit: + ret void +} + +; BFI_BEFORE-LABEL: block-frequency-info: func1 +; BFI_BEFORE: - entry: {{.*}} count = 1024 +; BFI_BEFORE: - loop_header: {{.*}} count = 21504 +; BFI_BEFORE: - loop_body: {{.*}} count = 20480 +; BFI_BEFORE: - loop_exit: {{.*}} count = 1024 + +; BFI_AFTER-LABEL: block-frequency-info: func1 +; BFI_AFTER: - entry: {{.*}} count = 1024 +; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024 +; BFI_AFTER: - loop_body: {{.*}} count = 20608 +; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024 +; BFI_AFTER: - loop_exit: {{.*}} count = 1024 + +; IR: entry: +; IR: br i1 %cmp1, label %loop_body.lr.ph, label %loop_exit, !prof [[PROF_FUNC1_0:![0-9]+]] + +; IR: loop_body: +; IR: br i1 %cmp, label %loop_body, label %loop_header.loop_exit_crit_edge, !prof [[PROF_FUNC1_1:![0-9]+]] + +; A function with unknown loop-bounds so loop-rotation ends up with a +; condition jump in pre-header and loop body. branch_weight shows body is +; executed more often than header. +define void @func1(i32 %n) !prof !3 { +entry: + br label %loop_header + +loop_header: + %i = phi i32 [0, %entry], [%i_inc, %loop_body] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop_body, label %loop_exit, !prof !4 + +loop_body: + store volatile i32 %i, ptr @g, align 4 + %i_inc = add i32 %i, 1 + br label %loop_header + +loop_exit: + ret void +} + +; BFI_BEFORE-LABEL: block-frequency-info: func2 +; BFI_BEFORE: - entry: {{.*}} count = 1024 +; BFI_BEFORE: - loop_header: {{.*}} count = 1056 +; BFI_BEFORE: - loop_body: {{.*}} count = 32 +; BFI_BEFORE: - loop_exit: {{.*}} count = 1024 + +; BFI_AFTER-LABEL: block-frequency-info: func2 +; - entry: {{.*}} count = 1024 +; - loop_body.lr.ph: {{.*}} count = 32 +; - loop_body: {{.*}} count = 32 +; - loop_header.loop_exit_crit_edge: {{.*}} count = 32 +; - loop_exit: {{.*}} count = 1024 + +; IR: entry: +; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC2_0:![0-9]+]] + +; IR: loop_body: +; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC2_1:![0-9]+]] + +; A function with unknown loop-bounds so loop-rotation ends up with a +; condition jump in pre-header and loop body. Similar to `func1` but here +; loop-exit count is higher than backedge count. +define void @func2(i32 %n) !prof !3 { +entry: + br label %loop_header + +loop_header: + %i = phi i32 [0, %entry], [%i_inc, %loop_body] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop_exit, label %loop_body, !prof !5 + +loop_body: + store volatile i32 %i, ptr @g, align 4 + %i_inc = add i32 %i, 1 + br label %loop_header + +loop_exit: + ret void +} + +!0 = !{!"function_entry_count", i64 1} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 3000, i32 1000} +!3 = !{!"function_entry_count", i64 1024} +!4 = !{!"branch_weights", i32 40, i32 2} +!5 = !{!"branch_weights", i32 10240, i32 320} + +; IR: [[PROF_FUNC0_0]] = !{!"branch_weights", i32 2000, i32 1000} +; IR: [[PROF_FUNC0_1]] = !{!"branch_weights", i32 999, i32 1} +; IR: [[PROF_FUNC1_0]] = !{!"branch_weights", i32 127, i32 1} +; IR: [[PROF_FUNC1_1]] = !{!"branch_weights", i32 2433, i32 127} +; IR: [[PROF_FUNC2_0]] = !{!"branch_weights", i32 9920, i32 320} +; IR: [[PROF_FUNC2_1]] = !{!"branch_weights", i32 320, i32 0} diff --git a/llvm/test/Transforms/LoopSimplify/merge-exits.ll b/llvm/test/Transforms/LoopSimplify/merge-exits.ll --- a/llvm/test/Transforms/LoopSimplify/merge-exits.ll +++ b/llvm/test/Transforms/LoopSimplify/merge-exits.ll @@ -103,7 +103,7 @@ ; CHECK-NEXT: [[T10:%.*]] = fcmp olt float [[T4]], 2.500000e+00 ; CHECK-NEXT: [[T12:%.*]] = icmp sgt i64 [[TMP0]], [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[T10]], [[T12]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[BB]], label [[BB1_BB3_CRIT_EDGE:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[BB]], label [[BB1_BB3_CRIT_EDGE:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: bb1.bb3_crit_edge: ; CHECK-NEXT: [[T4_LCSSA:%.*]] = phi float [ [[T4]], [[BB]] ] ; CHECK-NEXT: [[T9_LCSSA:%.*]] = phi float [ [[T9]], [[BB]] ]