This is used to help simplify the CFG for divergent regions, as well as to
get better code generation in some cases. For example, given the IR below:
define amdgpu_kernel void @test() {
bb:
  br label %bb1

bb1:
  %tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb4 ]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cnd = icmp eq i32 %tid, 0
  br i1 %cnd, label %bb4, label %bb2

bb2:
  %tmp3 = add nsw i32 %tmp, 1
  br label %bb4

bb4:
  %tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp, %bb1 ]
  store volatile i32 %tmp5, ptr addrspace(1) undef
  br label %bb1
}

declare i32 @llvm.amdgcn.workitem.id.x()

We got the following assembly before the change:
  v_mov_b32_e32 v1, 0
  v_cmp_eq_u32_e32 vcc, 0, v0
  s_branch .LBB0_2
.LBB0_1:                                ; %bb4
                                        ; in Loop: Header=BB0_2 Depth=1
  s_mov_b32 s2, -1
  s_mov_b32 s3, 0xf000
  buffer_store_dword v1, off, s[0:3], 0
  s_waitcnt vmcnt(0)
.LBB0_2:                                ; %bb
                                        ; =>This Inner Loop Header: Depth=1
  s_and_saveexec_b64 s[0:1], vcc
  s_xor_b64 s[0:1], exec, s[0:1]
  ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 killed $exec
  s_cbranch_execnz .LBB0_1
; %bb.3:                                ; %bb2
                                        ; in Loop: Header=BB0_2 Depth=1
  s_or_b64 exec, exec, s[0:1]
  s_waitcnt expcnt(0)
  v_add_i32_e64 v1, s[0:1], 1, v1
  s_branch .LBB0_1

After the change:
  s_mov_b32 s0, 0
  v_cmp_eq_u32_e32 vcc, 0, v0
  s_mov_b32 s2, -1
  s_mov_b32 s3, 0xf000
  v_mov_b32_e32 v0, s0
  s_branch .LBB0_2
.LBB0_1:                                ; %bb4
                                        ; in Loop: Header=BB0_2 Depth=1
  buffer_store_dword v0, off, s[0:3], 0
  s_waitcnt vmcnt(0)
.LBB0_2:                                ; %bb1
                                        ; =>This Inner Loop Header: Depth=1
  s_and_saveexec_b64 s[0:1], vcc
  s_cbranch_execnz .LBB0_1
; %bb.3:                                ; %bb2
                                        ; in Loop: Header=BB0_2 Depth=1
  s_or_b64 exec, exec, s[0:1]
  s_waitcnt expcnt(0)
  v_add_i32_e64 v0, s[0:1], 1, v0
  s_branch .LBB0_1

With the change, we use one fewer VGPR and one fewer s_xor_b64, and get
better LICM, with one additional branch. Please note that the experiment
was done with the workaround D139780 reverted, as that workaround stops
tail duplication completely for this case.

A comment should be added noting that this is not a hard requirement.
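
For illustration only, here is a hand-written sketch of what tail-duplicating
bb4 into both of its predecessors looks like at the IR level for the example
above. This is not compiler output (the actual duplication happens on
MachineIR), and the block names bb4.dup1/bb4.dup2 are made up:

define amdgpu_kernel void @test_dup() {
bb:
  br label %bb1

bb1:
  ; each copy of bb4 feeds its value directly back into this phi,
  ; so no separate join block is needed on the back edge
  %tmp = phi i32 [ 0, %bb ], [ %tmp, %bb4.dup1 ], [ %tmp3, %bb4.dup2 ]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cnd = icmp eq i32 %tid, 0
  br i1 %cnd, label %bb4.dup1, label %bb2

bb2:
  %tmp3 = add nsw i32 %tmp, 1
  br label %bb4.dup2

bb4.dup1:                               ; copy of bb4 reached from %bb1
  store volatile i32 %tmp, ptr addrspace(1) undef
  br label %bb1

bb4.dup2:                               ; copy of bb4 reached from %bb2
  store volatile i32 %tmp3, ptr addrspace(1) undef
  br label %bb1
}

declare i32 @llvm.amdgcn.workitem.id.x()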