Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -403,7 +403,11 @@ void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + /// Returns true if a block should be tail-duplicated to increase fallthrough + /// opportunities. bool shouldTailDuplicate(MachineBasicBlock *BB); + /// Returns true if a block can tail duplicate into all unplaced + /// predecessors. Filters based on loop. bool canTailDuplicateUnplacedPreds( MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, const BlockFilterSet *BlockFilter); @@ -565,7 +569,22 @@ return SuccProb; } -/// Check if a block should be tail duplicated. +/// Check if \p BB has exactly the successors in \p Successors. +static bool hasSameSuccessors( + MachineBasicBlock &BB, SmallPtrSetImpl<MachineBasicBlock *> &Successors) { + if (BB.succ_size() != Successors.size()) + return false; + // We don't want to count self-loops + if (Successors.count(&BB)) + return false; + for (MachineBasicBlock *Succ : BB.successors()) + if (!Successors.count(Succ)) + return false; + return true; +} + +/// Check if a block should be tail duplicated to increase fallthrough +/// opportunities. /// \p BB Block to check. bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { // Blocks with single successors don't create additional fallthrough @@ -592,14 +611,45 @@ if (!shouldTailDuplicate(Succ)) return false; + // For CFG checking. + SmallPtrSet<MachineBasicBlock *, 4> Successors(BB->succ_begin(), BB->succ_end()); for (MachineBasicBlock *Pred : Succ->predecessors()) { // Make sure all unplaced and unfiltered predecessors can be // tail-duplicated into.
if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) || BlockToChain[Pred] == &Chain) continue; - if (!TailDup.canTailDuplicate(Succ, Pred)) + if (!TailDup.canTailDuplicate(Succ, Pred)) { + if (Successors.size() > 1 + && hasSameSuccessors(*Pred, Successors)) + // This looks like a tail-duplicated block. Skip it. + // For example: + // A A + // |\ |\ + // | \ | \ + // | C | C + // | / | | + // |/ | | + // B => B | + // |\ |\/| + // | \ |/\| + // | D | D + // | / | / + // |/ |/ + // E E + // + // After B was duplicated into C, the layout looks like the one on the + // right. B and C now have the same successors. When considering whether + // E can be duplicated into all its unplaced predecessors, we ignore C. + // This allows lattices to be laid out in 2 separate chains (ABE...) and + // later (CD...) This is a reasonable heuristic because it allows the + // creation of 2 fallthrough paths with links between them. + // We look for the CFG pattern rather than recording the blocks because + // we want layout to be repeatable, and if some other pass does the + // tail-duplication, we want to lay it out the same way. 
+ continue; return false; + } } return true; } Index: test/CodeGen/AArch64/branch-relax-cbz.ll =================================================================== --- test/CodeGen/AArch64/branch-relax-cbz.ll +++ test/CodeGen/AArch64/branch-relax-cbz.ll @@ -6,23 +6,18 @@ ; CHECK-NEXT: ; BB#1: ; %b3 ; CHECK: ldr [[LOAD:w[0-9]+]] -; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]] - -; CHECK-NEXT: [[SKIP_LONG_B]]: +; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: [[B8]]: ; %b8 +; CHECK-NEXT: ret + ; CHECK-NEXT: [[B2]]: ; %b2 ; CHECK: mov w{{[0-9]+}}, #93 ; CHECK: bl _extfunc ; CHECK: cbz w{{[0-9]+}}, [[B7]] +; CHECK-NEXT: b [[B8]] -; CHECK-NEXT: [[B8]]: ; %b8 -; CHECK-NEXT: ret - -; CHECK-NEXT: [[B7]]: ; %b7 -; CHECK: mov w{{[0-9]+}}, #13 -; CHECK: b _extfunc define void @split_block_no_fallthrough(i64 %val) #0 { bb: %c0 = icmp sgt i64 %val, -5 Index: test/CodeGen/AArch64/optimize-cond-branch.ll =================================================================== --- test/CodeGen/AArch64/optimize-cond-branch.ll +++ test/CodeGen/AArch64/optimize-cond-branch.ll @@ -11,7 +11,8 @@ ; ; CHECK-LABEL: func ; CHECK-NOT: and -; CHECK: tbnz +; Layout reverses the test. 
+; CHECK: tbz define void @func() { %c0 = icmp sgt i64 0, 0 br i1 %c0, label %b1, label %b6 Index: test/CodeGen/AMDGPU/basic-branch.ll =================================================================== --- test/CodeGen/AMDGPU/basic-branch.ll +++ test/CodeGen/AMDGPU/basic-branch.ll @@ -8,13 +8,10 @@ ; GCNNOOPT: v_writelane_b32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] - -; GCN: ; BB#1 ; GCNNOOPT: v_readlane_b32 ; GCNNOOPT: v_readlane_b32 ; GCN: buffer_store_dword -; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; TODO: This waitcnt can be eliminated +; GCNNOOPT: s_endpgm ; GCN: {{^}}[[END]]: ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s ; GCN-LABEL: {{^}}test_loop: -; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: +; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}} ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll =================================================================== --- test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -29,6 +29,7 @@ ; GCN: v_cmp_ne_u32_e64 ; GCN: BB{{[0-9]+_[0-9]+}}: + define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -439,7 +439,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-NOHSA: buffer_store_dword [[ONE]] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] -; GCN; {{^}}[[EXIT]]: +; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 
bb3: ; preds = %bb2 Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -267,7 +267,9 @@ ; CHECK: v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]] ; CHECK-NEXT: s_cbranch_vccnz [[BB10:BB[0-9]+_[0-9]+]] -; CHECK-NEXT: s_branch [[END:BB[0-9]+_[0-9]+]] + +; CHECK: [[END:BB[0-9]+_[0-9]+]]: ; %end +; CHECK-NEXT: s_endpgm ; CHECK [[BB8]]: ; %BB8 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 8 @@ -278,9 +280,8 @@ ; CHECK: [[BB10]]: ; %bb10 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9 ; CHECK: buffer_store_dword +; CHECK: s_endpgm -; CHECK: [[END:BB[0-9]+_[0-9]+]]: ; %end -; CHECK-NEXT: s_endpgm define amdgpu_ps void @phi_use_def_before_kill() #0 { bb: Index: test/CodeGen/ARM/atomic-cmpxchg.ll =================================================================== --- test/CodeGen/ARM/atomic-cmpxchg.ll +++ test/CodeGen/ARM/atomic-cmpxchg.ll @@ -66,14 +66,14 @@ ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]: ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 -; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1 +; CHECK-ARMV7-NEXT: moveq r0, #1 ; CHECK-ARMV7-NEXT: bxeq lr ; CHECK-ARMV7-NEXT: [[TRY]]: -; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0] -; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]] +; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0] +; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1 ; CHECK-ARMV7-NEXT: beq [[HEAD]] ; CHECK-ARMV7-NEXT: clrex -; CHECK-ARMV7-NEXT: mov [[RES]], #0 +; CHECK-ARMV7-NEXT: mov r0, #0 ; CHECK-ARMV7-NEXT: bx lr ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8: Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -135,7 +135,7 @@ ; Important to check for beginning of basic block, because if it gets ; if-converted the test is probably no longer checking what it should. 
-; CHECK: {{LBB[0-9]+_2}}: +; CHECK: %end ; CHECK-NEXT: vpop {d7, d8} ; CHECK-NEXT: pop {r4, pc} Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,22 +1,22 @@ -; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +; RUN: llc -O2 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" ; Intended layout: -; The outlining flag produces the layout +; The chain-based outlining produces the layout ; test1 ; test2 ; test3 ; test4 -; exit ; optional1 ; optional2 ; optional3 ; optional4 +; exit ; Tail duplication puts test n+1 at the end of optional n ; so optional1 includes a copy of test2 at the end, and branches ; to test3 (at the top) or falls through to optional 2. -; The CHECK statements check for the whole string of tests and exit block, +; The CHECK statements check for the whole string of tests ; and then check that the correct test has been duplicated into the end of ; the optional blocks and that the optional blocks are in the correct order. ;CHECK-LABEL: straight_test: @@ -33,8 +33,7 @@ ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit -;CHECK: blr +;CHECK-NEXT: b [[EXITLABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: [[OPT1LABEL]] ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 ;CHECK-NEXT: beq 0, [[TEST3LABEL]] @@ -45,7 +44,8 @@ ;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 ;CHECK-NEXT: beq 0, [[EXITLABEL]] ;CHECK-NEXT: [[OPT4LABEL]] -;CHECK: b [[EXITLABEL]] +;CHECK: [[EXITLABEL]]: # %exit +;CHECK: blr define void @straight_test(i32 %tag) { entry: @@ -94,6 +94,113 @@ ret void } +; Intended layout: +; The chain-based outlining produces the layout +; entry +; --- Begin loop --- +; for.latch +; for.check +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; --- End loop --- +; exit +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: loop_test: +;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4 +;CHECK: [[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch +;CHECK: addi +;CHECK: [[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check +;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]]) +;CHECK: # %test1 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, [[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}} +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[LATCHLABEL]] +;CHECK-NEXT: b [[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[LATCHLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[LATCHLABEL]] +define void @loop_test(i32* %tags, i32 %count) { +entry: + br label %for.check +for.check: + %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch] + %done.count = icmp ugt i32 %count.loop, 0 + %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count + %tag = load i32, i32* %tag_ptr + %done.tag = icmp eq i32 %tag, 0 + %done = and i1 %done.count, %done.tag + br i1 %done, label %test1, label %exit +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %for.latch, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %for.latch +for.latch: + %count.sub = sub i32 %count.loop, 1 + br label %for.check +exit: + ret void +} + ; The block then2 is not unavoidable, but since it can be tail-duplicated, it ; should be placed as a fallthrough from test2 and copied. ; CHECK-LABEL: avoidable_test: @@ -105,7 +212,6 @@ ; CHECK: # %then2 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} -; CHECK: # %end2 ; CHECK: # %else1 ; CHECK: bl a ; CHECK: bl a @@ -113,6 +219,7 @@ ; CHECK: rlwinm. 
{{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 ; CHECK: # %else2 ; CHECK: bl c +; CHECK: # %end2 define void @avoidable_test(i32 %tag) { entry: br label %test1 @@ -141,7 +248,6 @@ call void @d() ret void } - declare void @a() declare void @b() declare void @c() Index: test/CodeGen/WebAssembly/mem-intrinsics.ll =================================================================== --- test/CodeGen/WebAssembly/mem-intrinsics.ll +++ test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s ; Test memcpy, memmove, and memset intrinsics. Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -314,7 +314,7 @@ define void @unnatural_cfg1() { ; Test that we can handle a loop with an inner unnatural loop at the end of ; a function. This is a gross CFG reduced out of the single source GCC. -; CHECK: unnatural_cfg1 +; CHECK-LABEL: unnatural_cfg1 ; CHECK: %entry ; CHECK: %loop.body1 ; CHECK: %loop.body2 @@ -352,17 +352,22 @@ ; Test that we can handle a loop with a nested natural loop *and* an unnatural ; loop. This was reduced from a crash on block placement when run over ; single-source GCC. 
-; CHECK: unnatural_cfg2 +; The tail-duplication outlining algorithm places +; %loop.body3 and %loop.inner1.begin out-of-line at the end of the loop, +; because %loop.body4 is unavoidable within the loop and short, +; and %loop.inner1.begin has an alternate fallthrough of %loop.body3. +; CHECK-LABEL: unnatural_cfg2 ; CHECK: %entry ; CHECK: %loop.body1 ; CHECK: %loop.body2 +; CHECK: %loop.body4 +; CHECK: %loop.inner2.begin +; CHECK: %loop.inner2.begin +; The loop.inner2.end block is folded ; CHECK: %loop.body3 ; CHECK: %loop.inner1.begin ; The end block is folded with %loop.body3... ; CHECK-NOT: %loop.inner1.end -; CHECK: %loop.body4 -; CHECK: %loop.inner2.begin -; The loop.inner2.end block is folded ; CHECK: %loop.header ; CHECK: %bail @@ -559,7 +564,7 @@ ; didn't correctly locate the fallthrough successor, assuming blindly that the ; first one was the fallthrough successor. As a result, we would add an ; erroneous jump to the landing pad thinking *that* was the default successor. -; CHECK: test_eh_lpad_successor +; CHECK-LABEL: test_eh_lpad_successor ; CHECK: %entry ; CHECK-NOT: jmp ; CHECK: %loop @@ -587,7 +592,7 @@ ; fallthrough simply won't occur. Make sure we don't crash trying to update ; terminators for such constructs. ; -; CHECK: test_eh_throw +; CHECK-LABEL: test_eh_throw ; CHECK: %entry ; CHECK: %cleanup @@ -609,7 +614,7 @@ ; attempt to merge onto the wrong end of the inner loop just because we find it ; first. This was reduced from a crasher in GCC's single source. ; -; CHECK: test_unnatural_cfg_backwards_inner_loop +; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop ; CHECK: %entry ; CHECK: %loop2b ; CHECK: %loop1 @@ -649,7 +654,7 @@ ; fallthrough because that happens to always produce unanalyzable branches on ; x86.
; -; CHECK: unanalyzable_branch_to_loop_header +; CHECK-LABEL: unanalyzable_branch_to_loop_header ; CHECK: %entry ; CHECK: %loop ; CHECK: %exit @@ -673,7 +678,7 @@ ; This branch is now analyzable and hence the destination block becomes the ; hotter one. The right order is entry->bar->exit->foo. ; -; CHECK: unanalyzable_branch_to_best_succ +; CHECK-LABEL: unanalyzable_branch_to_best_succ ; CHECK: %entry ; CHECK: %bar ; CHECK: %exit @@ -699,7 +704,7 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the best free block in the CFG. ; -; CHECK: unanalyzable_branch_to_free_block +; CHECK-LABEL: unanalyzable_branch_to_free_block ; CHECK: %entry ; CHECK: %a ; CHECK: %b @@ -729,7 +734,7 @@ ; Ensure that we don't crash as we're building up many unanalyzable branches, ; blocks, and loops. ; -; CHECK: many_unanalyzable_branches +; CHECK-LABEL: many_unanalyzable_branches ; CHECK: %entry ; CHECK: %exit @@ -948,7 +953,7 @@ ; strange layouts that are siginificantly less efficient, often times maing ; it discontiguous. ; -; CHECK: @benchmark_heapsort +; CHECK-LABEL: @benchmark_heapsort ; CHECK: %entry ; First rotated loop top. 
; CHECK: .p2align Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll =================================================================== --- test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -6,13 +6,13 @@ ; CHECK-LABEL: tail_dup_merge_loops ; CHECK: # %entry ; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %exit +; CHECK-NOT: # %{{[a-zA-Z_]+}} ; CHECK: # %inner_loop_exit ; CHECK-NOT: # %{{[a-zA-Z_]+}} ; CHECK: # %inner_loop_latch ; CHECK-NOT: # %{{[a-zA-Z_]+}} ; CHECK: # %inner_loop_test -; CHECK-NOT: # %{{[a-zA-Z_]+}} -; CHECK: # %exit define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 { entry: %notlhs674.i = icmp eq i32 %a, 0 Index: test/CodeGen/X86/tail-dup-repeat.ll =================================================================== --- test/CodeGen/X86/tail-dup-repeat.ll +++ test/CodeGen/X86/tail-dup-repeat.ll @@ -1,4 +1,4 @@ -; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s +; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: test/CodeGen/X86/tail-opts.ll =================================================================== --- test/CodeGen/X86/tail-opts.ll +++ test/CodeGen/X86/tail-opts.ll @@ -112,14 +112,13 @@ ; CHECK: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} ; CHECK-NEXT: jbe .LBB2_3 ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} -; CHECK-NEXT: ja .LBB2_4 -; CHECK-NEXT: jmp .LBB2_2 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} ; CHECK-NEXT: jbe .LBB2_2 ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} +; CHECK-NEXT: ja .LBB2_4 ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: ret Index: test/CodeGen/X86/twoaddr-coalesce-3.ll =================================================================== 
--- test/CodeGen/X86/twoaddr-coalesce-3.ll +++ test/CodeGen/X86/twoaddr-coalesce-3.ll @@ -19,7 +19,7 @@ ; Check that only one mov will be generated in the kernel loop. ; CHECK-LABEL: foo: -; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body +; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}} ; CHECK-NOT: mov ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]] ; CHECK-NOT: mov @@ -56,7 +56,7 @@ ; Check that only two mov will be generated in the kernel loop. ; CHECK-LABEL: goo: -; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body +; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}} ; CHECK-NOT: mov ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]] ; CHECK-NOT: mov