Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -455,15 +455,18 @@ const MachineBasicBlock *OldTop); bool hasViableTopFallthrough(const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet); + BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top, + const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopTop( const MachineLoop &L, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit( - const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + const MachineLoop &L, const BlockFilterSet &LoopBlockSet, + BlockFrequency &ExitFreq); BlockFilterSet collectLoopBlockSet(const MachineLoop &L); void buildLoopChains(const MachineLoop &L); void rotateLoop( BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, - const BlockFilterSet &LoopBlockSet); + BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet); void rotateLoopWithProfile( BlockChain &LoopChain, const MachineLoop &L, const BlockFilterSet &LoopBlockSet); @@ -1869,7 +1872,8 @@ /// fallthrough opportunities. MachineBasicBlock * MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, - const BlockFilterSet &LoopBlockSet) { + const BlockFilterSet &LoopBlockSet, + BlockFrequency &ExitFreq) { // We don't want to layout the loop linearly in all cases. If the loop header // is just a normal basic block in the loop, we want to look for what block // within the loop is the best one to layout at the top. However, if the loop @@ -1980,6 +1984,7 @@ LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n"); + ExitFreq = BestExitEdgeFreq; return ExitingBB; } @@ -2016,6 +2021,42 @@ return false; } +// Find out the possible fall through frequence to the top of a loop. +BlockFrequency +MachineBlockPlacement::TopFallThroughFreq( + const MachineBasicBlock *Top, + const BlockFilterSet &LoopBlockSet) { + BlockFrequency MaxFreq = 0; + for (MachineBasicBlock *Pred : Top->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + // Found a Pred block can be placed before Top. + // Check if Top is the best successor of Pred. + auto TopProb = MBPI->getEdgeProbability(Pred, Top); + bool TopOK = true; + for (MachineBasicBlock *Succ : Pred->successors()) { + auto SuccProb = MBPI->getEdgeProbability(Pred, Succ); + BlockChain *SuccChain = BlockToChain[Succ]; + // Check if Succ can be placed after Pred. + // Succ should not be in any chain, or it is the head of some chain. + if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) && + (!SuccChain || Succ == *SuccChain->begin())) { + TopOK = false; + break; + } + } + if (TopOK) { + BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) * + MBPI->getEdgeProbability(Pred, Top); + if (EdgeFreq > MaxFreq) + MaxFreq = EdgeFreq; + } + } + } + return MaxFreq; +} + /// Attempt to rotate an exiting block to the bottom of the loop. /// /// Once we have built a chain, try to rotate it to line up the hot exit block @@ -2024,6 +2065,7 @@ /// of its bottom already, don't rotate it. void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, + BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet) { if (!ExitingBB) return; @@ -2047,6 +2089,12 @@ (!SuccChain || Succ == *SuccChain->begin())) return; } + + // Rotate will destroy the top fallthrough, we need to ensure the new exit + // frequency is larger than top fallthrough. + BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet); + if (FallThrough2Top >= ExitFreq) + return; } BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB); @@ -2310,8 +2358,9 @@ // Loops are processed innermost to uttermost, make sure we clear // PreferredLoopExit before processing a new loop. PreferredLoopExit = nullptr; + BlockFrequency ExitFreq; if (!RotateLoopWithProfile && LoopTop == L.getHeader()) - PreferredLoopExit = findBestLoopExit(L, LoopBlockSet); + PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -2331,7 +2380,7 @@ if (RotateLoopWithProfile) rotateLoopWithProfile(LoopChain, L, LoopBlockSet); else - rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet); + rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet); LLVM_DEBUG({ // Crash at the end so we get all of the debugging output first. Index: test/CodeGen/AArch64/cmpxchg-idioms.ll =================================================================== --- test/CodeGen/AArch64/cmpxchg-idioms.ll +++ test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -111,7 +111,7 @@ ; CHECK: mov w22, #2 ; CHECK-NOT: mov w22, #4 ; CHECK-NOT: cmn w22, #4 -; CHECK: b [[LOOP2:LBB[0-9]+_[0-9]+]] +; CHECK: [[LOOP2:LBB[0-9]+_[0-9]+]]: ; %for.cond ; CHECK-NOT: b.ne [[LOOP2]] ; CHECK-NOT: b {{LBB[0-9]+_[0-9]+}} ; CHECK: bl _foo Index: test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf.ll +++ test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -96,20 +96,20 @@ ; FUNC-LABEL: {{^}}loop_land_info_assert: ; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}} ; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]] -; SI: s_branch [[INFLOOP:BB[0-9]+_[0-9]+]] + +; SI: [[WHILELOOP:BB[0-9]+_[0-9]+]]: ; %while.cond +; SI: s_cbranch_vccz [[FOR_COND_PH:BB[0-9]+_[0-9]+]] ; SI: [[CONVEX_EXIT:BB[0-9_]+]] ; SI: s_mov_b64 vcc, ; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]] -; SI: s_cbranch_vccnz [[INFLOOP]] + +; SI: s_cbranch_vccnz [[WHILELOOP]] ; SI: ; %if.else ; SI: buffer_store_dword -; SI: [[INFLOOP]]: -; SI: s_cbranch_vccnz [[CONVEX_EXIT]] - -; SI: ; %for.cond.preheader +; SI: [[FOR_COND_PH]]: ; %for.cond.preheader ; SI: s_cbranch_vccz [[ENDPGM]] ; SI: [[ENDPGM]]: Index: test/CodeGen/PowerPC/licm-remat.ll =================================================================== --- test/CodeGen/PowerPC/licm-remat.ll +++ test/CodeGen/PowerPC/licm-remat.ll @@ -24,8 +24,7 @@ ; CHECK-DAG: addi 25, 3, _ZN6snappy8internalL8wordmaskE@toc@l ; CHECK-DAG: addis 5, 2, _ZN6snappy8internalL10char_tableE@toc@ha ; CHECK-DAG: addi 24, 5, _ZN6snappy8internalL10char_tableE@toc@l -; CHECK: b .[[LABEL1:[A-Z0-9_]+]] -; CHECK: .[[LABEL1]]: # %for.cond +; CHECK: .LBB0_1: # %for.cond ; CHECK-NOT: addis {{[0-9]+}}, 2, _ZN6snappy8internalL8wordmaskE@toc@ha ; CHECK-NOT: addis {{[0-9]+}}, 2, _ZN6snappy8internalL10char_tableE@toc@ha ; CHECK: bctrl Index: test/CodeGen/Thumb/consthoist-physical-addr.ll =================================================================== --- test/CodeGen/Thumb/consthoist-physical-addr.ll +++ test/CodeGen/Thumb/consthoist-physical-addr.ll @@ -10,8 +10,9 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: ldr r3, .LCPI0_0 -; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: cmp r2, #128 +; CHECK-NEXT: beq .LBB0_5 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: str r4, [r3, #8] ; CHECK-NEXT: lsls r4, r2, #2 @@ -20,16 +21,15 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: str r5, [r3, #12] ; CHECK-NEXT: isb sy -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: ldr r5, [r3, #12] ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: bne .LBB0_2 +; CHECK-NEXT: bne .LBB0_3 ; CHECK-NEXT: ldr r5, [r3, #4] ; CHECK-NEXT: str r5, [r1, r4] ; CHECK-NEXT: adds r2, r2, #1 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: cmp r2, #128 -; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -82,14 +82,14 @@ ; Check that we sink cold loop blocks after the hot loop body. ; CHECK-LABEL: test_loop_cold_blocks: ; CHECK: %entry -; CHECK-NOT: .p2align -; CHECK: %unlikely1 -; CHECK-NOT: .p2align -; CHECK: %unlikely2 ; CHECK: .p2align ; CHECK: %body1 ; CHECK: %body2 ; CHECK: %body3 +; CHECK-NOT: .p2align +; CHECK: %unlikely1 +; CHECK-NOT: .p2align +; CHECK: %unlikely2 ; CHECK: %exit entry: @@ -1546,8 +1546,8 @@ ; CHECK-LABEL: not_rotate_if_extra_branch_regression ; CHECK: %.entry ; CHECK: %.first_backedge -; CHECK: %.slow ; CHECK: %.second_header +; CHECK: %.slow .entry: %sum.0 = shl nsw i32 %count, 1 br label %.first_header Index: test/CodeGen/X86/code_placement_cold_loop_blocks.ll =================================================================== --- test/CodeGen/X86/code_placement_cold_loop_blocks.ll +++ test/CodeGen/X86/code_placement_cold_loop_blocks.ll @@ -42,10 +42,10 @@ ; will merged to the outer loop chain. ; ; CHECK-LABEL: nested_loop_0: +; CHECK: callq b ; CHECK: callq c ; CHECK: callq d ; CHECK: callq e -; CHECK: callq b ; CHECK: callq f entry: Index: test/CodeGen/X86/code_placement_no_header_change.ll =================================================================== --- test/CodeGen/X86/code_placement_no_header_change.ll +++ test/CodeGen/X86/code_placement_no_header_change.ll @@ -7,9 +7,9 @@ ; Later backedge1 and backedge2 is rotated before loop header. ; CHECK-LABEL: bar ; CHECK: %.entry +; CHECK: %.header ; CHECK: %.backedge1 ; CHECK: %.backedge2 -; CHECK: %.header ; CHECK: %.exit .entry: %c = shl nsw i32 %count, 2 Index: test/CodeGen/X86/loop-blocks.ll =================================================================== --- test/CodeGen/X86/loop-blocks.ll +++ test/CodeGen/X86/loop-blocks.ll @@ -201,12 +201,12 @@ } ; CHECK-LABEL: check_minsize: -; CHECK: jmp .LBB4_1 ; CHECK-NOT: align -; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: callq loop_latch -; CHECK-NEXT: .LBB4_1: +; CHECK: .LBB4_1: ; CHECK-NEXT: callq loop_header +; CHECK: callq loop_latch +; CHECK: .LBB4_3: +; CHECK: callq exit define void @check_minsize() minsize nounwind { Index: test/CodeGen/X86/no_rotate.ll =================================================================== --- test/CodeGen/X86/no_rotate.ll +++ test/CodeGen/X86/no_rotate.ll @@ -0,0 +1,76 @@ +; RUN: llc -mtriple=i686-linux < %s | FileCheck %s + +; Don't rotate the loop if the number of fall through to exit is not larger +; than the number of fall through to header. + +define void @no_rotate() { +; CHECK-LABEL: no_rotate +; CHECK: %entry +; CHECK: %header +; CHECK: %middle +; CHECK: %latch1 +; CHECK: %latch2 +; CHECK: %end +entry: + br label %header + +header: + %val1 = call i1 @foo() + br i1 %val1, label %middle, label %end + +middle: + %val2 = call i1 @foo() + br i1 %val2, label %latch1, label %end + +latch1: + %val3 = call i1 @foo() + br i1 %val3, label %latch2, label %header + +latch2: + %val4 = call i1 @foo() + br label %header + +end: + ret void +} + +define void @do_rotate() { +; CHECK-LABEL: do_rotate +; CHECK: %entry +; CHECK: %then +; CHECK: %else +; CHECK: %latch1 +; CHECK: %latch2 +; CHECK: %header +; CHECK: %end +entry: + %val0 = call i1 @foo() + br i1 %val0, label %then, label %else + +then: + call void @a() + br label %header + +else: + call void @b() + br label %header + +header: + %val1 = call i1 @foo() + br i1 %val1, label %latch1, label %end + +latch1: + %val3 = call i1 @foo() + br i1 %val3, label %latch2, label %header + +latch2: + %val4 = call i1 @foo() + br label %header + +end: + ret void +} + +declare i1 @foo() +declare void @a() +declare void @b() Index: test/DebugInfo/X86/PR37234.ll =================================================================== --- test/DebugInfo/X86/PR37234.ll +++ test/DebugInfo/X86/PR37234.ll @@ -21,18 +21,18 @@ ; CHECK-LABEL: # %bb.{{.*}}: ; CHECK: #DEBUG_VALUE: main:aa <- 0 ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG:[0-9a-z]+]] -; CHECK: jmp .LBB0_1 -; CHECK: .LBB0_2: +; CHECK: .LBB0_1: +; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] +; CHECK: je .LBB0_4 +; CHECK: # %bb.{{.*}}: ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] ; CHECK: jne .LBB0_1 ; CHECK: # %bb.{{.*}}: ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] ; CHECK: incl %[[REG]] ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] -; CHECK: .LBB0_1: -; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] -; CHECK: jne .LBB0_2 -; CHECK: # %bb.{{.*}}: +; CHECK: jmp .LBB0_1 +; CHECK: .LBB0_4: ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] ; CHECK: retq