Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -452,6 +452,12 @@
   void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
                   BlockFilterSet *BlockFilter = nullptr);
+  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
+                               const MachineBasicBlock *OldTop);
+  bool hasRarePredecessors(const MachineBasicBlock *Latch,
+                           const MachineBasicBlock *Exit);
+  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopTop(
       const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopExit(
@@ -1749,19 +1755,82 @@
                << getBlockName(*Chain.begin()) << "\n");
 }
 
-/// \brief Find the best loop top block for layout.
+// If BottomBlock has only one successor OldTop, in most cases it is profitable
+// to move it before OldTop, except in the following case:
+//
+//     ---OldTop--
+//    |     .     |
+//    |     .     |
+//    |     .     |
+//     ---Pred    |
+//         |      |
+//         BB-----
+//
+// In this case we keep its original layout.
+bool
+MachineBlockPlacement::canMoveBottomBlockToTop(
+    const MachineBasicBlock *BottomBlock,
+    const MachineBasicBlock *OldTop) {
+  if (BottomBlock->pred_size() != 1)
+    return true;
+  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
+  if (Pred->succ_size() != 2)
+    return true;
+
+  MachineBasicBlock *OtherBB = *Pred->succ_begin();
+  if (OtherBB == BottomBlock)
+    OtherBB = *Pred->succ_rbegin();
+  if (OtherBB == OldTop)
+    return false;
+
+  return true;
+}
+
+// Check if the latch block has rarer predecessors than the exit block.
+// The rare predecessor frequency is defined as the total predecessor frequency
+// minus the max predecessor frequency. It is the number of taken branches saved
+// by moving the latch to the top of the loop.
+bool
+MachineBlockPlacement::hasRarePredecessors(const MachineBasicBlock *Latch,
+                                           const MachineBasicBlock *Exit)
+{
+  BlockFrequency MaxPredFreq;
+  BlockFrequency TotalPredFreq;
+  for (MachineBasicBlock *Pred : Latch->predecessors()) {
+    BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+    TotalPredFreq += PredFreq;
+    if (PredFreq > MaxPredFreq)
+      MaxPredFreq = PredFreq;
+  }
+  BlockFrequency ReducedBranches = TotalPredFreq - MaxPredFreq;
+  BlockFrequency ExitFreq = MBFI->getBlockFreq(Exit);
+  return ReducedBranches < ExitFreq;
+}
+
+/// \brief Helper function of findBestLoopTop. Find the best loop top block
+/// from the predecessors of the old top.
+///
+/// Look for a block which is strictly better than the old top for laying
+/// out before the old top of the loop. This looks for only two patterns:
+///
+/// 1. a block that has only one successor, the old loop top
+///
+///    Because such a block will always result in an unconditional jump,
+///    rotating it in front of the old top is always profitable.
+///
+/// 2. a block that has two successors, one the old top and the other a loop
+///    exit, and that has more than one predecessor
 ///
-/// Look for a block which is strictly better than the loop header for laying
-/// out at the top of the loop. This looks for one and only one pattern:
-/// a latch block with no conditional exit. This block will cause a conditional
-/// jump around it or will be the bottom of the loop if we lay it out in place,
-/// but if it it doesn't end up at the bottom of the loop for any reason,
-/// rotation alone won't fix it. Because such a block will always result in an
-/// unconditional jump (for the backedge) rotating it in front of the loop
-/// header is always profitable.
+/// If such a block is laid out below one of its predecessors P, only P can
+/// fall through to it; all other predecessors need a taken jump to it, plus
+/// another conditional jump to the loop header. If it is moved before the
+/// loop header, all its predecessors jump to it and then fall through to the
+/// loop header, so every predecessor except P saves one taken branch.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
-                                       const BlockFilterSet &LoopBlockSet) {
+MachineBlockPlacement::findBestLoopTopHelper(
+    MachineBasicBlock *OldTop,
+    const MachineLoop &L,
+    const BlockFilterSet &LoopBlockSet) {
   // Placing the latch block before the header may introduce an extra branch
   // that skips this block the first time the loop is executed, which we want
   // to avoid when optimising for size.
@@ -1770,33 +1839,55 @@
   // In practice this never happens though: there always seems to be a preheader
   // that can fallthrough and that is also placed before the header.
   if (F->getFunction().optForSize())
-    return L.getHeader();
+    return OldTop;
 
   // Check that the header hasn't been fused with a preheader block due to
   // crazy branches. If it has, we need to start with the header at the top to
   // prevent pulling the preheader into the loop body.
-  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+  BlockChain &HeaderChain = *BlockToChain[OldTop];
   if (!LoopBlockSet.count(*HeaderChain.begin()))
-    return L.getHeader();
+    return OldTop;
 
-  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
+  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
                << "\n");
 
   BlockFrequency BestPredFreq;
   MachineBasicBlock *BestPred = nullptr;
-  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
+  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
     if (!LoopBlockSet.count(Pred))
       continue;
+    if (Pred == L.getHeader())
+      continue;
     DEBUG(dbgs() << "    header pred: " << getBlockName(Pred) << ", has "
                  << Pred->succ_size() << " successors, ";
           MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
-    if (Pred->succ_size() > 1)
+    if (Pred->succ_size() > 2)
       continue;
 
+    if (Pred->succ_size() == 2) {
+      // The candidate should have an exit edge.
+      MachineBasicBlock *OutBB = *Pred->succ_begin();
+      if (OutBB == OldTop)
+        OutBB = *Pred->succ_rbegin();
+      if (LoopBlockSet.count(OutBB))
+        continue;
+
+      // It must have more than one predecessor.
+      if (Pred->pred_size() == 1)
+        continue;
+
+      // Moving the candidate to the top must reduce taken branches.
+      if (hasRarePredecessors(Pred, OutBB))
+        continue;
+    } else
+      // OldTop is the only successor of Pred.
+      if (!canMoveBottomBlockToTop(Pred, OldTop))
+        continue;
+
     BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
     if (!BestPred || PredFreq > BestPredFreq ||
         (!(PredFreq < BestPredFreq) &&
-         Pred->isLayoutSuccessor(L.getHeader()))) {
+         Pred->isLayoutSuccessor(OldTop))) {
       BestPred = Pred;
       BestPredFreq = PredFreq;
     }
@@ -1805,7 +1896,7 @@
   // If no direct predecessor is fine, just use the loop header.
   if (!BestPred) {
     DEBUG(dbgs() << "    final top unchanged\n");
-    return L.getHeader();
+    return OldTop;
   }
 
   // Walk backwards through any straight line of predecessors.
@@ -1818,6 +1909,24 @@
   return BestPred;
 }
 
+/// \brief Find the best loop top block for layout.
+///
+/// This function iteratively calls findBestLoopTopHelper until no better
+/// BB can be found.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
+                                       const BlockFilterSet &LoopBlockSet) {
+  MachineBasicBlock *OldTop = nullptr;
+  MachineBasicBlock *NewTop = L.getHeader();
+  while (NewTop != OldTop) {
+    OldTop = NewTop;
+    NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
+    if (NewTop != OldTop)
+      ComputedEdges[NewTop] = { OldTop, false };
+  }
+  return NewTop;
+}
+
 /// \brief Find the best loop exiting block for layout.
 ///
 /// This routine implements the logic to analyze the loop looking for the best
Index: test/CodeGen/AArch64/neg-imm.ll
===================================================================
--- test/CodeGen/AArch64/neg-imm.ll
+++ test/CodeGen/AArch64/neg-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -disable-block-placement -o - %s | FileCheck %s
 
 ; LSR used to pick a sub-optimal solution due to the target responding
 ; conservatively to isLegalAddImmediate for negative values.
Index: test/CodeGen/AArch64/tailmerging_in_mbp.ll
===================================================================
--- test/CodeGen/AArch64/tailmerging_in_mbp.ll
+++ test/CodeGen/AArch64/tailmerging_in_mbp.ll
@@ -1,9 +1,8 @@
 ; RUN: llc <%s -mtriple=aarch64-eabi -verify-machine-dom-info | FileCheck %s
 
 ; CHECK-LABEL: test:
-; CHECK:       LBB0_7:
-; CHECK:       b.hi
-; CHECK-NEXT:  b
+; CHECK-LABEL: %cond.false12.i
+; CHECK:       b.gt
 ; CHECK-NEXT:  LBB0_8:
 ; CHECK-NEXT:  mov x8, x9
 ; CHECK-NEXT:  LBB0_9:
Index: test/CodeGen/AMDGPU/collapse-endcf.ll
===================================================================
--- test/CodeGen/AMDGPU/collapse-endcf.ll
+++ test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -205,6 +205,11 @@
 
 ; Make sure scc liveness is updated if sor_b64 is removed
 ; GCN-LABEL: {{^}}scc_liveness:
+; GCN: %bb10
+; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: s_andn2_b64
+; GCN-NEXT: s_cbranch_execz
+
 ; GCN: [[BB1_LOOP:BB[0-9]+_[0-9]+]]:
 ; GCN: s_andn2_b64 exec, exec,
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
@@ -215,10 +220,6 @@
 ; GCN-NOT: s_or_b64 exec, exec
 ; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
 
-; GCN: s_andn2_b64
-; GCN-NEXT: s_cbranch_execnz
-
-; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
Index: test/CodeGen/AMDGPU/global_smrd_cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/global_smrd_cfg.ll
+++ test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -1,27 +1,28 @@
 ; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-LABEL: %bb11
+; CHECK-LABEL: %bb22
 
-; Load from %arg in a Loop body has alias store
+; Load from %arg has alias store in Loop
 
 ; CHECK: flat_load_dword
 
-; CHECK-LABEL: %bb20
-; CHECK: flat_store_dword
+; #####################################################################
+
+; Load from %arg1 has no-alias store in Loop - arg1[i+1] never alias arg1[i]
+
+; CHECK: s_load_dword
 
 ; #####################################################################
 
-; CHECK-LABEL: %bb22
+; CHECK-LABEL: %bb11
 
-; Load from %arg has alias store in Loop
+; Load from %arg in a Loop body has alias store
 
 ; CHECK: flat_load_dword
 
-; #####################################################################
-
-; Load from %arg1 has no-alias store in Loop - arg1[i+1] never alias arg1[i]
+; CHECK-LABEL: %bb20
 
-; CHECK: s_load_dword
+; CHECK:
flat_store_dword define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 { bb: Index: test/CodeGen/AMDGPU/hoist-cond.ll =================================================================== --- test/CodeGen/AMDGPU/hoist-cond.ll +++ test/CodeGen/AMDGPU/hoist-cond.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck %s ; Check that invariant compare is hoisted out of the loop. ; At the same time condition shall not be serialized into a VGPR and deserialized later Index: test/CodeGen/AMDGPU/loop_break.ll =================================================================== --- test/CodeGen/AMDGPU/loop_break.ll +++ test/CodeGen/AMDGPU/loop_break.ll @@ -1,5 +1,5 @@ ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s ; Uses llvm.amdgcn.break Index: test/CodeGen/AMDGPU/madmk.ll =================================================================== --- test/CodeGen/AMDGPU/madmk.ll +++ test/CodeGen/AMDGPU/madmk.ll @@ -186,9 +186,9 @@ } ; SI-LABEL: {{^}}kill_madmk_verifier_error: +; SI: s_or_b64 ; SI: s_xor_b64 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}} -; SI: s_or_b64 define amdgpu_kernel void @kill_madmk_verifier_error() nounwind { bb: br label %bb2 Index: test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- test/CodeGen/AMDGPU/multilevel-break.ll +++ test/CodeGen/AMDGPU/multilevel-break.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s ; OPT-LABEL: {{^}}define amdgpu_vs void @multi_else_break( ; OPT: main_body: Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/ARM/code-placement.ll =================================================================== --- test/CodeGen/ARM/code-placement.ll +++ test/CodeGen/ARM/code-placement.ll @@ -38,8 +38,9 @@ br i1 %0, label %bb5, label %bb.nph15 bb1: ; preds = %bb2.preheader, %bb1 +; CHECK: LBB1_[[BB3:.]]: @ %bb3 ; CHECK: LBB1_[[PREHDR:.]]: @ %bb2.preheader -; CHECK: blt LBB1_[[BB3:.]] +; CHECK: blt LBB1_[[BB3]] %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %bb2.preheader ] ; [#uses=2] %sum.08 = phi i32 [ %2, %bb1 ], [ %sum.110, %bb2.preheader ] ; [#uses=1] %tmp17 = sub i32 %i.07, %indvar ; [#uses=1] @@ -53,7 +54,6 @@ bb3: ; preds = %bb1, %bb2.preheader ; CHECK: LBB1_[[BB1:.]]: @ %bb1 ; CHECK: bne LBB1_[[BB1]] -; CHECK: LBB1_[[BB3]]: @ %bb3 %sum.0.lcssa = phi i32 [ %sum.110, %bb2.preheader ], [ %2, %bb1 ] ; [#uses=2] %3 = 
add i32 %pass.011, 1 ; [#uses=2] %exitcond18 = icmp eq i32 %3, %passes ; [#uses=1] Index: test/CodeGen/ARM/swifterror.ll =================================================================== --- test/CodeGen/ARM/swifterror.ll +++ test/CodeGen/ARM/swifterror.ll @@ -183,7 +183,7 @@ ; CHECK-APPLE: mov r0, #16 ; CHECK-APPLE: malloc ; CHECK-APPLE: strb r{{.*}}, [r0, #8] -; CHECK-APPLE: ble +; CHECK-APPLE: b ; CHECK-APPLE: mov r8, [[ID]] ; CHECK-O0-LABEL: foo_loop: Index: test/CodeGen/PowerPC/cmp_elimination.ll =================================================================== --- test/CodeGen/PowerPC/cmp_elimination.ll +++ test/CodeGen/PowerPC/cmp_elimination.ll @@ -718,13 +718,14 @@ define void @func28(i32 signext %a) { ; CHECK-LABEL: @func28 ; CHECK: cmplwi [[REG1:[0-9]+]], [[REG2:[0-9]+]] -; CHECK: .[[LABEL1:[A-Z0-9_]+]]: +; CHECK: .[[LABEL2:[A-Z0-9_]+]]: +; CHECK: cmpwi [[REG1]], [[REG2]] +; CHECK: ble 0, .[[LABEL1:[A-Z0-9_]+]] ; CHECK-NOT: cmp -; CHECK: bne 0, .[[LABEL2:[A-Z0-9_]+]] +; CHECK: bne 0, .[[LABEL2]] ; CHECK: bl dummy1 -; CHECK: .[[LABEL2]]: -; CHECK: cmpwi [[REG1]], [[REG2]] -; CHECK: bgt 0, .[[LABEL1]] +; CHECK: b .[[LABEL2]] +; CHECK: .[[LABEL1]]: ; CHECK: blr entry: br label %do.body Index: test/CodeGen/PowerPC/licm-remat.ll =================================================================== --- test/CodeGen/PowerPC/licm-remat.ll +++ test/CodeGen/PowerPC/licm-remat.ll @@ -24,8 +24,8 @@ ; CHECK-DAG: addi 25, 23, _ZN6snappy8internalL8wordmaskE@toc@l ; CHECK-DAG: addis 5, 2, _ZN6snappy8internalL10char_tableE@toc@ha ; CHECK-DAG: addi 24, 5, _ZN6snappy8internalL10char_tableE@toc@l -; CHECK: b .LBB0_2 -; CHECK: .LBB0_2: # %for.cond +; CHECK: b .[[LABEL1:[A-Z0-9_]+]] +; CHECK: .[[LABEL1]]: # %for.cond ; CHECK-NOT: addis {{[0-9]+}}, 2, _ZN6snappy8internalL8wordmaskE@toc@ha ; CHECK-NOT: addis {{[0-9]+}}, 2, _ZN6snappy8internalL10char_tableE@toc@ha ; CHECK: bctrl Index: test/CodeGen/SystemZ/atomicrmw-minmax-01.ll =================================================================== --- test/CodeGen/SystemZ/atomicrmw-minmax-01.ll +++ test/CodeGen/SystemZ/atomicrmw-minmax-01.ll @@ -1,8 +1,8 @@ ; Test 8-bit atomic min/max operations. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1 -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2 +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT1 +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT2 ; Check signed minimum. ; - CHECK is for the main loop. Index: test/CodeGen/SystemZ/atomicrmw-minmax-02.ll =================================================================== --- test/CodeGen/SystemZ/atomicrmw-minmax-02.ll +++ test/CodeGen/SystemZ/atomicrmw-minmax-02.ll @@ -1,8 +1,8 @@ ; Test 8-bit atomic min/max operations. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1 -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2 +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT1 +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT2 ; Check signed minimum. ; - CHECK is for the main loop. Index: test/CodeGen/SystemZ/loop-01.ll =================================================================== --- test/CodeGen/SystemZ/loop-01.ll +++ test/CodeGen/SystemZ/loop-01.ll @@ -1,7 +1,7 @@ ; Test loop tuning. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-block-placement | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -disable-block-placement \ ; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-Z13 ; Test that strength reduction is applied to addresses with a scale factor, Index: test/CodeGen/SystemZ/loop-02.ll =================================================================== --- test/CodeGen/SystemZ/loop-02.ll +++ test/CodeGen/SystemZ/loop-02.ll @@ -1,7 +1,7 @@ ; Test BRCTH. ; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -mcpu=z196 \ -; RUN: -no-integrated-as | FileCheck %s +; RUN: -no-integrated-as -disable-block-placement | FileCheck %s ; Test a loop that should be converted into dbr form and then use BRCTH. define void @f2(i32 *%src, i32 *%dest) { Index: test/CodeGen/SystemZ/swifterror.ll =================================================================== --- test/CodeGen/SystemZ/swifterror.ll +++ test/CodeGen/SystemZ/swifterror.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu| FileCheck %s -; RUN: llc < %s -O0 -mtriple=s390x-linux-gnu | FileCheck --check-prefix=CHECK-O0 %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s +; RUN: llc < %s -O0 -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck --check-prefix=CHECK-O0 %s declare i8* @malloc(i64) declare void @free(i8*) Index: test/CodeGen/X86/2009-02-26-MachineLICMBug.ll =================================================================== --- test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -20,20 +20,7 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: orq $2097152, %r14 ## imm = 0x200000 ; CHECK-NEXT: andl $15728640, %r14d ## imm = 0xF00000 -; CHECK-NEXT: jmp LBB0_1 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_3: ## %bb.i -; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pinsrd $1, 4, %xmm0 -; CHECK-NEXT: pinsrd $2, 8, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7] -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: addps %xmm1, %xmm0 -; CHECK-NEXT: movaps %xmm0, 0 ; CHECK-NEXT: LBB0_1: ## %bb4 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -48,7 +35,19 @@ ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: cmpq $1048576, %r14 ## imm 
= 0x100000 ; CHECK-NEXT: jne LBB0_1 -; CHECK-NEXT: jmp LBB0_3 +; CHECK-NEXT: ## %bb.3: ## %bb.i +; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: pinsrd $1, 4, %xmm0 +; CHECK-NEXT: pinsrd $2, 8, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7] +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, 0 +; CHECK-NEXT: jmp LBB0_1 entry: br label %bb4 Index: test/CodeGen/X86/avx-cmp.ll =================================================================== --- test/CodeGen/X86/avx-cmp.ll +++ test/CodeGen/X86/avx-cmp.ll @@ -32,11 +32,7 @@ ; CHECK-NEXT: jne .LBB2_6 ; CHECK-NEXT: # %bb.1: # %for.cond5.preheader ; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: jmp .LBB2_2 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB2_5: # %if.then -; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 -; CHECK-NEXT: callq scale ; CHECK-NEXT: .LBB2_2: # %for.cond5 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: testb %bl, %bl @@ -49,7 +45,10 @@ ; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vucomisd {{\.LCPI.*}}, %xmm0 ; CHECK-NEXT: jne .LBB2_5 -; CHECK-NEXT: jp .LBB2_5 +; CHECK-NEXT: jnp .LBB2_2 +; CHECK-NEXT: .LBB2_5: # %if.then +; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: callq scale ; CHECK-NEXT: jmp .LBB2_2 ; CHECK-NEXT: .LBB2_6: # %for.end52 ; CHECK-NEXT: popq %rbx Index: test/CodeGen/X86/avx512-i1test.ll =================================================================== --- test/CodeGen/X86/avx512-i1test.ll +++ test/CodeGen/X86/avx512-i1test.ll @@ -15,16 +15,15 @@ ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB0_1: # %bb56 ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: jmp .LBB0_2 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_3: # %bb35 -; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: .LBB0_2: # %bb33 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: # %bb.3: # %bb35 +; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jmp .LBB0_2 bb1: br i1 undef, label %L_10, label %L_10 Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -124,7 +124,7 @@ ret i32 %sum } -!0 = !{!"branch_weights", i32 4, i32 64} +!0 = !{!"branch_weights", i32 1, i32 64} define i32 @test_loop_early_exits(i32 %i, i32* %a) { ; Check that we sink early exit blocks out of loop bodies. 
@@ -961,11 +961,11 @@ ; CHECK: %while.cond.outer ; Third rotated loop top ; CHECK: .p2align +; CHECK: %if.end20 ; CHECK: %while.cond ; CHECK: %while.body ; CHECK: %land.lhs.true ; CHECK: %if.then19 -; CHECK: %if.end20 ; CHECK: %if.then8 ; CHECK: ret Index: test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir =================================================================== --- test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir +++ test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir @@ -27,12 +27,12 @@ # return result; # } # -# CHECK: 49: eb 4a jmp 74 -# CHECK: 57: eb 3c jmp 60 -# CHECK: 65: eb 2e jmp 46 -# CHECK: 73: eb 20 jmp 32 -# CHECK: 81: eb 12 jmp 18 -# CHECK: 93: 7f 8b jg -117 +# CHECK: 22: eb 72 jmp 114 +# CHECK: 5e: eb 36 jmp 54 +# CHECK: 6c: eb 28 jmp 40 +# CHECK: 7a: eb 1a jmp 26 +# CHECK: 88: eb 0c jmp 12 +# CHECK: 94: 7f a3 jg -93 # Test 2: # @@ -57,11 +57,11 @@ # return w; # } # -# CHECK: 129: eb 13 jmp 19 -# CHECK: 12e: eb a0 jmp -96 -# CHECK: 132: eb 9c jmp -100 -# CHECK: 137: eb 97 jmp -105 -# CHECK: 13c: eb 92 jmp -110 +# CHECK: 129: eb 41 jmp 65 +# CHECK: 137: eb 92 jmp -110 +# CHECK: 144: eb 85 jmp -123 +# CHECK: 152: e9 74 ff ff ff jmp -140 +# CHECK: 167: e9 5f ff ff ff jmp -161 --- | ; ModuleID = 'D:\iusers\opaparo\dev_test\branch_instruction_and_target_split_perf_nops.ll' source_filename = "D:\5C\5Ciusers\5C\5Copaparo\5C\5Cdev_test\5C\5Cbranch_instruction_and_target_split_perf_nops.c" Index: test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll =================================================================== --- test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll +++ test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll @@ -1,13 +1,12 @@ ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s define void @foo() { -; Test that when determining the edge probability from a node in an inner loop -; to a node in an outer loop, the weights on edges in the inner loop should be -; ignored if we are building the chain for the outer loop. +; After moving the latch to the top of loop, there is no fall through from the +; latch to outer loop. ; ; CHECK-LABEL: foo: -; CHECK: callq c ; CHECK: callq b +; CHECK: callq c entry: %call = call zeroext i1 @a() Index: test/CodeGen/X86/code_placement_loop_rotation2.ll =================================================================== --- test/CodeGen/X86/code_placement_loop_rotation2.ll +++ test/CodeGen/X86/code_placement_loop_rotation2.ll @@ -5,13 +5,13 @@ ; Test a nested loop case when profile data is not available. ; ; CHECK-LABEL: foo: +; CHECK: callq h ; CHECK: callq b -; CHECK: callq c -; CHECK: callq d +; CHECK: callq g ; CHECK: callq e ; CHECK: callq f -; CHECK: callq g -; CHECK: callq h +; CHECK: callq c +; CHECK: callq d entry: br label %header Index: test/CodeGen/X86/move_latch_to_loop_top.ll =================================================================== --- test/CodeGen/X86/move_latch_to_loop_top.ll +++ test/CodeGen/X86/move_latch_to_loop_top.ll @@ -0,0 +1,209 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s + +; The block latch should be moved before header. 
+;CHECK-LABEL: test1: +;CHECK: %latch +;CHECK: %header +;CHECK: %false +define i32 @test1(i32* %p) { +entry: + br label %header + +header: + %x1 = phi i64 [0, %entry], [%x2, %latch] + %count1 = phi i32 [0, %entry], [%count4, %latch] + %0 = ptrtoint i32* %p to i64 + %1 = add i64 %0, %x1 + %2 = inttoptr i64 %1 to i32* + %data = load i32, i32* %2 + %3 = icmp eq i32 %data, 0 + br i1 %3, label %latch, label %false + +false: + %count2 = add i32 %count1, 1 + br label %latch + +latch: + %count4 = phi i32 [%count2, %false], [%count1, %header] + %x2 = add i64 %x1, 1 + %4 = icmp eq i64 %x2, 100 + br i1 %4, label %exit, label %header + +exit: + ret i32 %count4 +} + +; The block latch and one of false/true should be moved before header. +;CHECK-LABEL: test2: +;CHECK: %false +;CHECK: %latch +;CHECK: %header +;CHECK: %true +define i32 @test2(i32* %p) { +entry: + br label %header + +header: + %x1 = phi i64 [0, %entry], [%x2, %latch] + %count1 = phi i32 [0, %entry], [%count4, %latch] + %0 = ptrtoint i32* %p to i64 + %1 = add i64 %0, %x1 + %2 = inttoptr i64 %1 to i32* + %data = load i32, i32* %2 + %3 = icmp eq i32 %data, 0 + br i1 %3, label %true, label %false + +false: + %count2 = add i32 %count1, 1 + br label %latch + +true: + %count3 = add i32 %count1, 2 + br label %latch + +latch: + %count4 = phi i32 [%count2, %false], [%count3, %true] + %x2 = add i64 %x1, 1 + %4 = icmp eq i64 %x2, 100 + br i1 %4, label %exit, label %header + +exit: + ret i32 %count4 +} + +; More blocks can be moved before header. +;CHECK-LABEL: test3: +;CHECK: %true3 +;CHECK: %endif3 +;CHECK: %latch +;CHECK: %header +;CHECK: %false +define i32 @test3(i32* %p) { +entry: + br label %header + +header: + %x1 = phi i64 [0, %entry], [%x2, %latch] + %count1 = phi i32 [0, %entry], [%count12, %latch] + %0 = ptrtoint i32* %p to i64 + %1 = add i64 %0, %x1 + %2 = inttoptr i64 %1 to i32* + %data = load i32, i32* %2 + %3 = icmp eq i32 %data, 0 + br i1 %3, label %true, label %false + +false: + %count2 = add i32 %count1, 1 + %cond = icmp sgt i32 %count2, 10 + br i1 %cond, label %true2, label %false2 + +false2: + %count3 = and i32 %count2, 7 + br label %endif2 + +true2: + %count4 = mul i32 %count2, 3 + br label %endif2 + +endif2: + %count5 = phi i32 [%count3, %false2], [%count4, %true2] + %count6 = sub i32 %count5, 5 + br label %latch + +true: + %count7 = add i32 %count1, 2 + %cond2 = icmp slt i32 %count7, 20 + br i1 %cond2, label %true3, label %false3 + +false3: + %count8 = or i32 %count7, 3 + br label %endif3 + +true3: + %count9 = xor i32 %count7, 55 + br label %endif3 + +endif3: + %count10 = phi i32 [%count8, %false3], [%count9, %true3] + %count11 = add i32 %count10, 3 + br label %latch + +latch: + %count12 = phi i32 [%count6, %endif2], [%count11, %endif3] + %x2 = add i64 %x1, 1 + %4 = icmp eq i64 %x2, 100 + br i1 %4, label %exit, label %header + +exit: + ret i32 %count12 +} + +; The exit block has higher frequency than false block, so latch block should +; fall through to exit block. 
+;CHECK-LABEL: test4: +;CHECK: %false +;CHECK: %header +;CHECK: %true +;CHECK: %latch +;CHECK: %exit +define i32 @test4(i32 %t, i32* %p) { +entry: + br label %header + +header: + %x1 = phi i64 [0, %entry], [%x2, %latch] + %count1 = phi i32 [0, %entry], [%count4, %latch] + %0 = ptrtoint i32* %p to i64 + %1 = add i64 %0, %x1 + %2 = inttoptr i64 %1 to i32* + %data = load i32, i32* %2 + %3 = icmp eq i32 %data, 0 + br i1 %3, label %true, label %false, !prof !1 + +false: + %count2 = add i32 %count1, 1 + br label %latch + +true: + %count3 = add i32 %count1, 2 + br label %latch + +latch: + %count4 = phi i32 [%count2, %false], [%count3, %true] + %x2 = add i64 %x1, 1 + %4 = icmp eq i64 %x2, 100 + br i1 %4, label %exit, label %header, !prof !2 + +exit: + ret i32 %count4 +} + +!1 = !{!"branch_weights", i32 100, i32 1} +!2 = !{!"branch_weights", i32 16, i32 16} + +; If move latch to loop top doesn't reduce taken branch, don't do it. +;CHECK-LABEL: test5: +;CHECK: %entry +;CHECK: %header +;CHECK: %latch +define void @test5(i32* %p) { +entry: + br label %header + +header: + %x1 = phi i64 [0, %entry], [%x1, %header], [%x2, %latch] + %0 = ptrtoint i32* %p to i64 + %1 = add i64 %0, %x1 + %2 = inttoptr i64 %1 to i32* + %data = load i32, i32* %2 + %3 = icmp eq i32 %data, 0 + br i1 %3, label %latch, label %header + +latch: + %x2 = add i64 %x1, 1 + br label %header + +exit: + ret void +} + Index: test/CodeGen/X86/pr5145.ll =================================================================== --- test/CodeGen/X86/pr5145.ll +++ test/CodeGen/X86/pr5145.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s +; RUN: llc -disable-block-placement -mtriple=x86_64-- < %s | FileCheck %s @sc8 = external global i8 define void @atomic_maxmin_i8() { Index: test/CodeGen/X86/ragreedy-bug.ll =================================================================== --- test/CodeGen/X86/ragreedy-bug.ll +++ test/CodeGen/X86/ragreedy-bug.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: movl ; CHECK-NEXT: andl ; CHECK-NEXT: testl -; CHECK-NEXT: je +; CHECK-NEXT: jne ; CHECK: cond.false.i.i ; CHECK: maskrune ; CHECK-NEXT: movzbl Index: test/CodeGen/X86/swifterror.ll =================================================================== --- test/CodeGen/X86/swifterror.ll +++ test/CodeGen/X86/swifterror.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-APPLE %s -; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-O0 %s -; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin | FileCheck --check-prefix=CHECK-i386 %s +; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -disable-block-placement | FileCheck --check-prefix=CHECK-APPLE %s +; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=x86_64-apple-darwin -disable-block-placement | FileCheck --check-prefix=CHECK-O0 %s +; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin -disable-block-placement | FileCheck --check-prefix=CHECK-i386 %s declare i8* @malloc(i64) declare void @free(i8*) Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll =================================================================== --- test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -66,18 +66,19 @@ ; After layout we tail merge blocks merge_other and merge_predecessor_split. 
; We do this even though they share only a single instruction, because ; merge_predecessor_split falls through to their shared successor: -; outer_loop_latch. +; outer_loop_latch. And then outer_loop_latch is moved before shared_loop_header +; to reduce taken branches. ; The rest of the blocks in the function are noise unfortunately. Bugpoint ; couldn't shrink the test any further. ; CHECK-LABEL: loop_shared_header ; CHECK: # %entry ; CHECK: # %shared_preheader +; CHECK: # %outer_loop_latch +; CHECK: # %outer_loop_latch ; CHECK: # %shared_loop_header ; CHECK: # %inner_loop_body -; CHECK: # %outer_loop_latch ; CHECK: # %merge_predecessor_split -; CHECK: # %outer_loop_latch ; CHECK: # %cleanup define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 { entry: Index: test/CodeGen/X86/tail-dup-repeat.ll =================================================================== --- test/CodeGen/X86/tail-dup-repeat.ll +++ test/CodeGen/X86/tail-dup-repeat.ll @@ -28,7 +28,6 @@ br label %dup1 ; CHECK: # %if.end70 -; CHECK-NEXT: # in Loop: ; CHECK-NEXT: movl $12, (%rdx) ; CHECK-NEXT: movl $2, (%rcx) ; CHECK-NEXT: testl %eax, %eax Index: test/CodeGen/X86/x86-cmov-converter.ll =================================================================== --- test/CodeGen/X86/x86-cmov-converter.ll +++ test/CodeGen/X86/x86-cmov-converter.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs -disable-block-placement < %s | FileCheck %s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This test checks that x86-cmov-converter optimization transform CMOV Index: test/DebugInfo/X86/dbg-value-transfer-order.ll =================================================================== --- test/DebugInfo/X86/dbg-value-transfer-order.ll +++ test/DebugInfo/X86/dbg-value-transfer-order.ll @@ -24,6 +24,12 @@ ; with the Orders insertion point vector. ; CHECK-LABEL: f: # @f +; CHECK: .LBB0_3: +; Check that this DEBUG_VALUE comes before the left shift. +; CHECK: #DEBUG_VALUE: bit_offset <- $ecx +; CHECK: .cv_loc 0 1 8 28 # t.c:8:28 +; CHECK: movl $1, %[[reg:[^ ]*]] +; CHECK: shll %cl, %[[reg]] ; CHECK: .LBB0_1: # %while.body ; CHECK: movl $32, %ecx ; CHECK: testl {{.*}} @@ -31,12 +37,7 @@ ; CHECK: # %bb.2: # %if.then ; CHECK: callq if_then ; CHECK: movl %eax, %ecx -; CHECK: .LBB0_3: # %if.end -; Check that this DEBUG_VALUE comes before the left shift. -; CHECK: #DEBUG_VALUE: bit_offset <- $ecx -; CHECK: .cv_loc 0 1 8 28 # t.c:8:28 -; CHECK: movl $1, %[[reg:[^ ]*]] -; CHECK: shll %cl, %[[reg]] +; CHECK: jmp .LBB0_3 ; ModuleID = 't.c' source_filename = "t.c"
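Note for reviewers (not part of the patch): the cost model in hasRarePredecessors is easy to reproduce by hand. Moving the latch above the old top saves one taken branch for every latch predecessor except the most frequent one, but gives up the latch's fall-through into the exit block, so the move only pays off when the saved branches outweigh the exit frequency. The following standalone sketch makes that comparison concrete; plain uint64_t values stand in for llvm::BlockFrequency, and the example frequencies are invented.

// Standalone illustration of the hasRarePredecessors comparison.
// Assumption: uint64_t frequencies approximate BlockFrequency; numbers are made up.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Returns true when the latch's non-dominant predecessors are rarer than the
// exit block, i.e. moving the latch to the loop top would not reduce taken
// branches overall.
static bool hasRarePredecessors(const std::vector<uint64_t> &LatchPredFreqs,
                                uint64_t ExitFreq) {
  uint64_t Total = 0, Max = 0;
  for (uint64_t F : LatchPredFreqs) {
    Total += F;
    Max = std::max(Max, F);
  }
  uint64_t ReducedBranches = Total - Max; // branches saved by the move
  return ReducedBranches < ExitFreq;      // savings don't cover the lost fall-through
}

int main() {
  // Latch reached from blocks with frequencies 90, 8 and 2; the exit runs 15
  // times. Saved branches = (90 + 8 + 2) - 90 = 10 < 15, so reject the move.
  std::cout << hasRarePredecessors({90, 8, 2}, 15) << '\n'; // prints 1 (reject)
  // With a colder exit (frequency 4), the same latch is worth moving.
  std::cout << hasRarePredecessors({90, 8, 2}, 4) << '\n';  // prints 0 (move)
}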