Index: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp +++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp @@ -451,6 +451,8 @@ void buildChain(const MachineBasicBlock *BB, BlockChain &Chain, BlockFilterSet *BlockFilter = nullptr); + bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock, + const MachineBasicBlock *OldTop); MachineBasicBlock *findBestLoopTop( const MachineLoop &L, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit( @@ -1756,6 +1758,39 @@ << getBlockName(*Chain.begin()) << "\n"); } +// If the bottom block BB has only one successor OldTop, in most cases it is +// profitable to move it before OldTop, except in the following case: +// +// -->OldTop<- +// | . | +// | . | +// | . | +// ---Pred | +// | | +// BB----- +// +// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't +// lay out the other successor below it, so it can't reduce taken branches. +// In this case return false so that the original layout is kept. +bool +MachineBlockPlacement::canMoveBottomBlockToTop( + const MachineBasicBlock *BottomBlock, + const MachineBasicBlock *OldTop) { + if (BottomBlock->pred_size() != 1) + return true; + MachineBasicBlock *Pred = *BottomBlock->pred_begin(); + if (Pred->succ_size() != 2) + return true; + + MachineBasicBlock *OtherBB = *Pred->succ_begin(); + if (OtherBB == BottomBlock) + OtherBB = *Pred->succ_rbegin(); + if (OtherBB == OldTop) + return false; + + return true; +} + /// Find the best loop top block for layout. 
/// /// Look for a block which is strictly better than the loop header for laying @@ -1800,6 +1835,9 @@ if (Pred->succ_size() > 1) continue; + if (!canMoveBottomBlockToTop(Pred, L.getHeader())) + continue; + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); if (!BestPred || PredFreq > BestPredFreq || (!(PredFreq < BestPredFreq) && Index: llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -96,20 +96,20 @@ ; FUNC-LABEL: {{^}}loop_land_info_assert: ; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}} ; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]] -; SI: s_mov_b64 vcc, [[CMP4M]] -; SI-NEXT: s_cbranch_vccnz [[CONVEX_EXIT:BB[0-9_]+]] -; SI-NEXT: s_branch [[FOR_COND_PREHDR:BB[0-9_]+]] +; SI: s_branch [[INFLOOP:BB[0-9]+_[0-9]+]] + +; SI: [[CONVEX_EXIT:BB[0-9_]+]] +; SI: s_mov_b64 vcc, +; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]] +; SI: s_cbranch_vccnz [[INFLOOP]] ; SI: ; %if.else ; SI: buffer_store_dword -; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]: +; SI: [[INFLOOP]]: +; SI: s_cbranch_vccnz [[CONVEX_EXIT]] -; SI: [[CONVEX_EXIT]]: -; SI: s_mov_b64 vcc, -; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]] -; SI: s_branch [[INFLOOP]] -; SI-NEXT: [[FOR_COND_PREHDR]]: +; SI: ; %for.cond.preheader ; SI: s_cbranch_vccz [[ENDPGM]] ; SI: [[ENDPGM]]: Index: llvm/trunk/test/CodeGen/PowerPC/licm-remat.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/licm-remat.ll +++ llvm/trunk/test/CodeGen/PowerPC/licm-remat.ll @@ -24,8 +24,8 @@ ; CHECK-DAG: addi 25, 3, _ZN6snappy8internalL8wordmaskE@toc@l ; CHECK-DAG: addis 5, 2, _ZN6snappy8internalL10char_tableE@toc@ha ; CHECK-DAG: addi 24, 5, _ZN6snappy8internalL10char_tableE@toc@l -; CHECK: b .LBB0_2 -; CHECK: .LBB0_2: # %for.cond +; CHECK: b .[[LABEL1:[A-Z0-9_]+]] +; CHECK: 
.[[LABEL1]]: # %for.cond ; CHECK-NOT: addis {{[0-9]+}}, 2, _ZN6snappy8internalL8wordmaskE@toc@ha ; CHECK-NOT: addis {{[0-9]+}}, 2, _ZN6snappy8internalL10char_tableE@toc@ha ; CHECK: bctrl Index: llvm/trunk/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ llvm/trunk/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -20,22 +20,7 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: orq $2097152, %r14 ## imm = 0x200000 ; CHECK-NEXT: andl $15728640, %r14d ## imm = 0xF00000 -; CHECK-NEXT: jmp LBB0_1 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_3: ## %bb.i -; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movl 0, %eax -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ssq %rax, %xmm0 -; CHECK-NEXT: movl 4, %eax -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ssq %rax, %xmm1 -; CHECK-NEXT: movl 8, %eax -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: cvtsi2ssq %rax, %xmm2 -; CHECK-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; CHECK-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: movaps %xmm0, 0 ; CHECK-NEXT: LBB0_1: ## %bb4 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -50,7 +35,21 @@ ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: cmpq $1048576, %r14 ## imm = 0x100000 ; CHECK-NEXT: jne LBB0_1 -; CHECK-NEXT: jmp LBB0_3 +; CHECK-NEXT: ## %bb.3: ## %bb.i +; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movl 0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ssq %rax, %xmm0 +; CHECK-NEXT: movl 4, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ssq %rax, %xmm1 +; CHECK-NEXT: movl 8, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ssq %rax, %xmm2 +; CHECK-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; CHECK-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; CHECK-NEXT: 
movaps %xmm0, 0 +; CHECK-NEXT: jmp LBB0_1 entry: br label %bb4 Index: llvm/trunk/test/CodeGen/X86/avx-cmp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-cmp.ll +++ llvm/trunk/test/CodeGen/X86/avx-cmp.ll @@ -35,11 +35,7 @@ ; CHECK-NEXT: # %bb.1: # %for.cond5.preheader ; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: movb $1, %bpl -; CHECK-NEXT: jmp .LBB2_2 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB2_5: # %if.then -; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 -; CHECK-NEXT: callq scale ; CHECK-NEXT: .LBB2_2: # %for.cond5 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: testb %bl, %bl @@ -52,7 +48,10 @@ ; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vucomisd {{\.LCPI.*}}, %xmm0 ; CHECK-NEXT: jne .LBB2_5 -; CHECK-NEXT: jp .LBB2_5 +; CHECK-NEXT: jnp .LBB2_2 +; CHECK-NEXT: .LBB2_5: # %if.then +; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: callq scale ; CHECK-NEXT: jmp .LBB2_2 ; CHECK-NEXT: .LBB2_6: # %for.end52 ; CHECK-NEXT: addq $8, %rsp Index: llvm/trunk/test/CodeGen/X86/avx512-i1test.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-i1test.ll +++ llvm/trunk/test/CodeGen/X86/avx512-i1test.ll @@ -15,16 +15,15 @@ ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB0_1: # %bb56 ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: jmp .LBB0_2 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_3: # %bb35 -; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: .LBB0_2: # %bb33 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: # %bb.3: # %bb35 +; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jmp .LBB0_2 bb1: br i1 undef, label %L_10, label %L_10 Index: llvm/trunk/test/CodeGen/X86/code_placement_no_header_change.ll 
=================================================================== --- llvm/trunk/test/CodeGen/X86/code_placement_no_header_change.ll +++ llvm/trunk/test/CodeGen/X86/code_placement_no_header_change.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple=i686-linux < %s | FileCheck %s + + +define i32 @bar(i32 %count) { +; Test checks that basic block backedge2 is not moved before the header, +; because it can't reduce taken branches. +; Later backedge1 and backedge2 are rotated before the loop header. +; CHECK-LABEL: bar +; CHECK: %.entry +; CHECK: %.backedge1 +; CHECK: %.backedge2 +; CHECK: %.header +; CHECK: %.exit +.entry: + %c = shl nsw i32 %count, 2 + br label %.header + +.header: + %val1 = call i32 @foo() + %cond1 = icmp sgt i32 %val1, 1 + br i1 %cond1, label %.exit, label %.backedge1 + +.backedge1: + %val2 = call i32 @foo() + %cond2 = icmp sgt i32 %val2, 1 + br i1 %cond2, label %.header, label %.backedge2 + +.backedge2: + %val3 = call i32 @foo() + br label %.header + +.exit: + ret i32 %c +} + +declare i32 @foo() Index: llvm/trunk/test/DebugInfo/X86/PR37234.ll =================================================================== --- llvm/trunk/test/DebugInfo/X86/PR37234.ll +++ llvm/trunk/test/DebugInfo/X86/PR37234.ll @@ -22,19 +22,18 @@ ; CHECK: #DEBUG_VALUE: main:aa <- 0 ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG:[0-9a-z]+]] ; CHECK: jmp .LBB0_1 -; CHECK: .LBB0_3: -; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] -; CHECK: incl %[[REG]] -; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] +; CHECK: .LBB0_2: +; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] +; CHECK: jne .LBB0_1 +; CHECK: # %bb.{{.*}}: +; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] +; CHECK: incl %[[REG]] +; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] ; CHECK: .LBB0_1: ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] -; CHECK: je .LBB0_4 +; CHECK: jne .LBB0_2 ; CHECK: # %bb.{{.*}}: ; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] -; CHECK: jne .LBB0_1 -; CHECK: jmp .LBB0_3 -; CHECK: .LBB0_4: -; CHECK: #DEBUG_VALUE: main:aa <- $[[REG]] ; CHECK: retq 
source_filename = "PR37234.cpp"