Skip to content

Commit 7fbec9b

Browse files
author
Kyle Butt
committedFeb 15, 2017
Codegen: Make chains from trellis-shaped CFGs
Lay out trellis-shaped CFGs optimally. A trellis of the shape below: A B |\ /| | \ / | | X | | / \ | |/ \| C D would be laid out A; B->C ; D by the current layout algorithm. Now we identify trellises and lay them out either A->C; B->D or A->D; B->C. This scales with an increasing number of predecessors. A trellis is a a group of 2 or more predecessor blocks that all have the same successors. because of this we can tail duplicate to extend existing trellises. As an example consider the following CFG: B D F H / \ / \ / \ / \ A---C---E---G---Ret Where A,C,E,G are all small (Currently 2 instructions). The CFG preserving layout is then A,B,C,D,E,F,G,H,Ret. The current code will copy C into B, E into D and G into F and yield the layout A,C,B(C),E,D(E),F(G),G,H,ret define void @straight_test(i32 %tag) { entry: br label %test1 test1: ; A %tagbit1 = and i32 %tag, 1 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 br i1 %tagbit1eq0, label %test2, label %optional1 optional1: ; B call void @A() br label %test2 test2: ; C %tagbit2 = and i32 %tag, 2 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 br i1 %tagbit2eq0, label %test3, label %optional2 optional2: ; D call void @b() br label %test3 test3: ; E %tagbit3 = and i32 %tag, 4 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 br i1 %tagbit3eq0, label %test4, label %optional3 optional3: ; F call void @c() br label %test4 test4: ; G %tagbit4 = and i32 %tag, 8 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 br i1 %tagbit4eq0, label %exit, label %optional4 optional4: ; H call void @d() br label %exit exit: ret void } here is the layout after D27742: straight_test: # @straight_test ; ... Prologue elided ; BB#0: # %entry ; A (merged with test1) ; ... More prologue elided mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_2 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_3 b .LBB0_4 .LBB0_2: # %optional1 ; B (copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_4 .LBB0_3: # %test3 ; E rlwinm. 3, 30, 0, 29, 29 beq 0, .LBB0_5 b .LBB0_6 .LBB0_4: # %optional2 ; D (copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_5: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 b .LBB0_7 .LBB0_6: # %optional3 ; F (copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit ; Ret ld 30, 96(1) # 8-byte Folded Reload addi 1, 1, 112 ld 0, 16(1) mtlr 0 blr The tail-duplication has produced some benefit, but it has also produced a trellis which is not laid out optimally. With this patch, we improve the layouts of such trellises, and decrease the cost calculation for tail-duplication accordingly. This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have back edges, which is a negative, but it has a bigger compensating positive, which is that it handles the case where there are long strings of skipped blocks much better than the original layout. Both layouts handle runs of executed blocks equally well. Branch prediction also improves if there is any correlation between subsequent optional blocks. Here is the resulting concrete layout: straight_test: # @straight_test ; BB#0: # %entry ; A (merged with test1) mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_4 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_5 .LBB0_2: # %test3 ; E rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_3: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 bne 0, .LBB0_7 b .LBB0_8 .LBB0_4: # %optional1 ; B (Copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_2 .LBB0_5: # %optional2 ; D (Copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 beq 0, .LBB0_3 .LBB0_6: # %optional3 ; F (Copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit Differential Revision: https://reviews.llvm.org/D28522 llvm-svn: 295223
1 parent a4601b5 commit 7fbec9b

24 files changed

+741
-143
lines changed
 

‎llvm/lib/CodeGen/MachineBlockPlacement.cpp

+293-17
Large diffs are not rendered by default.

‎llvm/test/CodeGen/AArch64/branch-relax-cbz.ll

+6-7
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,22 @@
66

77
; CHECK-NEXT: ; BB#1: ; %b3
88
; CHECK: ldr [[LOAD:w[0-9]+]]
9-
; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
10-
; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
11-
12-
; CHECK-NEXT: [[SKIP_LONG_B]]:
9+
; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
1310
; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
1411

12+
; CHECK-NEXT: [[B8]]: ; %b8
13+
; CHECK-NEXT: ret
14+
1515
; CHECK-NEXT: [[B2]]: ; %b2
1616
; CHECK: mov w{{[0-9]+}}, #93
1717
; CHECK: bl _extfunc
1818
; CHECK: cbz w{{[0-9]+}}, [[B7]]
19-
20-
; CHECK-NEXT: [[B8]]: ; %b8
21-
; CHECK-NEXT: ret
19+
; CHECK-NEXT: b [[B8]]
2220

2321
; CHECK-NEXT: [[B7]]: ; %b7
2422
; CHECK: mov w{{[0-9]+}}, #13
2523
; CHECK: b _extfunc
24+
2625
define void @split_block_no_fallthrough(i64 %val) #0 {
2726
bb:
2827
%c0 = icmp sgt i64 %val, -5

‎llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ declare void @do_something() #1
264264
define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
265265
; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
266266
; CHECK: cmn
267-
; CHECK: b.gt
267+
; CHECK: b.le
268268
; CHECK: cmp
269269
; CHECK: b.gt
270270
entry:

‎llvm/test/CodeGen/AArch64/optimize-cond-branch.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ target triple = "arm64--"
1111
;
1212
; CHECK-LABEL: func
1313
; CHECK-NOT: and
14-
; CHECK: tbnz
14+
; CHECK: tbz
1515
define void @func() {
1616
%c0 = icmp sgt i64 0, 0
1717
br i1 %c0, label %b1, label %b6

‎llvm/test/CodeGen/AMDGPU/basic-branch.ll

+1-4
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,10 @@
88
; GCNNOOPT: v_writelane_b32
99
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
1010

11-
12-
; GCN: ; BB#1
1311
; GCNNOOPT: v_readlane_b32
1412
; GCNNOOPT: v_readlane_b32
1513
; GCN: buffer_store_dword
16-
; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
17-
; TODO: This waitcnt can be eliminated
14+
; GCNNOOPT: s_endpgm
1815

1916
; GCN: {{^}}[[END]]:
2017
; GCN: s_endpgm

‎llvm/test/CodeGen/AMDGPU/branch-relaxation.ll

+2-1
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,8 @@ ret:
491491

492492
; GCN-LABEL: {{^}}long_branch_hang:
493493
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
494-
; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
494+
; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
495+
; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
495496
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
496497

497498
; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(

‎llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
33

44
; GCN-LABEL: {{^}}test_loop:
5-
; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
5+
; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
66
; GCN: ds_read_b32
77
; GCN: ds_write_b32
88
; GCN: s_branch [[LABEL]]

‎llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ bb5: ; preds = %bb3, %bb
2929
; GCN: v_cmp_ne_u32_e64
3030

3131
; GCN: BB{{[0-9]+_[0-9]+}}:
32+
3233
define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
3334
bb:
3435
%tmp = call i32 @llvm.amdgcn.workitem.id.x()

‎llvm/test/CodeGen/AMDGPU/salu-to-valu.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ entry:
439439
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
440440
; GCN-NOHSA: buffer_store_dword [[ONE]]
441441
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
442-
; GCN; {{^}}[[EXIT]]:
442+
; GCN: {{^}}[[EXIT]]:
443443
; GCN: s_endpgm
444444
define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
445445
bb3: ; preds = %bb2

‎llvm/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212
; CHECK: bl _quux
1313
; CHECK-NOT: bl _quux
1414

15-
; NOMERGE: bl _baz
16-
; NOMERGE: bl _baz
15+
; NOMERGE-DAG: bl _baz
16+
; NOMERGE-DAG: bl _baz
1717

18-
; NOMERGE: bl _quux
19-
; NOMERGE: bl _quux
18+
; NOMERGE-DAG: bl _quux
19+
; NOMERGE-DAG: bl _quux
2020

2121
; ModuleID = 'tail.c'
2222
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"

‎llvm/test/CodeGen/ARM/atomic-cmpxchg.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -66,14 +66,14 @@ entry:
6666
; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
6767
; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
6868
; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
69-
; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
69+
; CHECK-ARMV7-NEXT: moveq r0, #1
7070
; CHECK-ARMV7-NEXT: bxeq lr
7171
; CHECK-ARMV7-NEXT: [[TRY]]:
72-
; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
73-
; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
72+
; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
73+
; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
7474
; CHECK-ARMV7-NEXT: beq [[HEAD]]
7575
; CHECK-ARMV7-NEXT: clrex
76-
; CHECK-ARMV7-NEXT: mov [[RES]], #0
76+
; CHECK-ARMV7-NEXT: mov r0, #0
7777
; CHECK-ARMV7-NEXT: bx lr
7878

7979
; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:

‎llvm/test/CodeGen/ARM/fold-stack-adjust.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ define void @test_fold_point(i1 %tst) minsize {
135135

136136
; Important to check for beginning of basic block, because if it gets
137137
; if-converted the test is probably no longer checking what it should.
138-
; CHECK: {{LBB[0-9]+_2}}:
138+
; CHECK: %end
139139
; CHECK-NEXT: vpop {d7, d8}
140140
; CHECK-NEXT: pop {r4, pc}
141141

‎llvm/test/CodeGen/PowerPC/tail-dup-break-cfg.ll

+7-7
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@ target triple = "powerpc64le-grtev4-linux-gnu"
1616
;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
1717
;CHECK-NEXT: # %test2
1818
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
19-
;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
20-
;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
19+
;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
20+
;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
21+
;CHECK: blr
2122
;CHECK-NEXT: [[BODY1LABEL]]
2223
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
2324
;CHECK-NEXT: beq 0, [[EXITLABEL]]
24-
;CHECK-NEXT: [[BODY2LABEL]]
25-
;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
26-
;CHECK: blr
25+
;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
26+
;CHECK: b [[EXITLABEL]]
2727
define void @tail_dup_break_cfg(i32 %tag) {
2828
entry:
2929
br label %test1
@@ -79,7 +79,7 @@ body1:
7979
test2:
8080
%tagbit2 = and i32 %tag, 2
8181
%tagbit2eq0 = icmp ne i32 %tagbit2, 0
82-
br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely
82+
br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
8383
body2:
8484
call void @b()
8585
call void @b()
@@ -137,4 +137,4 @@ ret:
137137

138138
!1 = !{!"branch_weights", i32 5, i32 3}
139139
!2 = !{!"branch_weights", i32 95, i32 5}
140-
!3 = !{!"branch_weights", i32 7, i32 3}
140+
!3 = !{!"branch_weights", i32 8, i32 3}

0 commit comments

Comments
 (0)
Please sign in to comment.