diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s @@ -19,25 +19,29 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch: -; GCN: s_load_dword [[CND:s[0-9]+]] -; GCN: s_cmp_eq_u32 [[CND]], 0 -; GCN-NEXT: s_cbranch_scc1 [[BB3:.LBB[0-9]+_[0-9]+]] - - -; GCN-NEXT: ; %bb.1: ; %bb2 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_sleep 0 - -; GCN-NEXT: [[BB3]]: ; %bb3 -; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] -; GCN: buffer_store_dword [[V_CND]] -; GCN: s_endpgm define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_max_short_forward_branch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB0_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch @@ -56,31 +60,35 @@ ret void } -; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch: -; GCN: s_load_dword [[CND:s[0-9]+]] -; GCN: s_cmp_eq_u32 [[CND]], 0 -; GCN-NEXT: s_cbranch_scc0 [[LONGBB:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] - -; GCN-NEXT: [[LONGBB]]: -; GCN-NEXT: ;;#ASMSTART -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN-NEXT: ;;#ASMEND - -; GCN-NEXT: [[ENDBB]]: -; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] -; GCN: buffer_store_dword [[V_CND]] -; GCN: s_endpgm define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 +; GCN-NEXT: .LBB1_3: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc0: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB1_2-.Lpost_getpc0)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB1_2-.Lpost_getpc0)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB1_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; 
GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB1_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb0: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch @@ -99,31 +107,37 @@ ret void } -; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch: -; GCN: s_load_dword [[CND:s[0-9]+]] - -; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0 -; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]] -; GCN: s_cbranch_vccz [[LONGBB:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] - -; GCN-NEXT: [[LONGBB]]: -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 - -; GCN: [[ENDBB]]: -; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] -; GCN: buffer_store_dword [[V_CND]] -; GCN: s_endpgm define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccz .LBB2_1 +; GCN-NEXT: .LBB2_3: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc1: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB2_2-.Lpost_getpc1)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB2_2-.Lpost_getpc1)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB2_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB2_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb0: %cmp = fcmp oeq float %cnd, 0.0 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch @@ -141,21 +155,44 @@ ret void } -; GCN-LABEL: {{^}}min_long_forward_vbranch: - -; GCN: buffer_load_dword -; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} -; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc - -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 - -; GCN: s_or_b64 exec, exec, [[SAVE]] -; GCN: buffer_store_dword -; GCN: s_endpgm define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: min_long_forward_vbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: 
v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: .LBB3_3: ; %bb +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc2: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB3_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB3_2: ; %bb3 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 @@ -177,34 +214,29 @@ ret void } -; GCN-LABEL: {{^}}long_backward_sbranch: -; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}} - -; GCN: .L[[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2 -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1 -; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10 - -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: ;;#ASMEND - -; GCN-NEXT: s_cbranch_scc0 [[ENDBB:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb2 -; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1 - -; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.L[[LOOPBB]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOPBB]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] - -; GCN-NEXT: [[ENDBB]]: -; GCN-NEXT: s_endpgm define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: long_backward_sbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .LBB4_1: ; %bb2 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_i32 s0, s0, 1 +; GCN-NEXT: s_cmp_lt_i32 s0, 10 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 +; GCN-NEXT: .LBB4_3: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc3: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB4_2: ; %bb3 +; GCN-NEXT: s_endpgm bb: br label %bb2 @@ -226,34 +258,59 @@ ; Requires expansion of unconditional branch from %bb2 to %bb4 (and ; expansion of conditional branch from %bb to %bb3. 
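; All of the relaxed branches in this file share one shape: the short
; conditional branch is inverted to skip over an unconditional far jump
; materialized with s_getpc_b64. A minimal sketch of that far-jump sequence
; follows; the SGPR pair, the .Lpost_getpcN label, and the .LBB_DEST label
; are placeholders here, and the generated checks pin the real numbers per
; function:
;
;   s_getpc_b64 s[4:5]               ; s[4:5] = address of .Lpost_getpcN
; .Lpost_getpcN:
;   s_add_u32  s4, s4, (.LBB_DEST-.Lpost_getpcN)&4294967295 ; low 32 bits of offset
;   s_addc_u32 s5, s5, (.LBB_DEST-.Lpost_getpcN)>>32        ; high bits plus carry
;   s_setpc_b64 s[4:5]               ; indirect jump to .LBB_DEST
;
; The -amdgpu-s-branch-bits=4 flag in the RUN line shrinks the legal short
; branch range for testing, which is why a handful of v_nop_e64 payload
; instructions is enough to force this expansion.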
-; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch: -; GCN: s_cmp_eq_u32 -; GCN: s_cbranch_scc{{[0-1]}} [[BB1:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], ([[BB4:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], ([[BB4]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC0_LO]]:[[PC0_HI]]] - -; GCN: [[BB1]]: -; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17 -; GCN: buffer_store_dword [[BB2_K]] - -; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63 -; GCN: buffer_store_dword [[BB4_K]] -; GCN: s_endpgm - -; GCN: [[BB4]]: ; %bb3 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: ;;#ASMEND - -; GCN: .Lfunc_end{{[0-9]+}}: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { +; GCN-LABEL: uniform_unconditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_cbranch_scc0 .LBB5_1 +; GCN-NEXT: .LBB5_7: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc5: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB5_1: ; %Flow +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN-NEXT: .LBB5_2: ; %bb2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB5_3: ; %bb4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 63 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; GCN-NEXT: .LBB5_4: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cbranch_execnz .LBB5_5 +; GCN-NEXT: .LBB5_9: ; %bb3 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc6: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB5_5: ; %bb3 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc4: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -276,31 +333,29 @@ ret void } -; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch: -; GCN-NEXT: ; %bb.0: ; %entry -; GCN-NEXT: s_and_b64 vcc, exec, -1 -; GCN-NEXT: .L[[LOOP:BB[0-9]_[0-9]+]]: ; %loop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 vcc, vcc -; GCN-NEXT: s_cbranch_vccz .LBB6_2 -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %loop -; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1 - -; GCN-NEXT: s_getpc_b64 
s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.L[[LOOP]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOP]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock -; GCN-NEXT: s_endpgm -; GCN-NEXT: .Lfunc_end{{[0-9]+}}: define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) { +; GCN-LABEL: uniform_unconditional_min_long_backward_branch: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_and_b64 vcc, exec, -1 +; GCN-NEXT: .LBB6_1: ; %loop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b64 vcc, vcc +; GCN-NEXT: s_cbranch_vccz .LBB6_2 +; GCN-NEXT: .LBB6_3: ; %loop +; GCN-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: .Lpost_getpc7: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock +; GCN-NEXT: s_endpgm entry: br label %loop @@ -317,42 +372,44 @@ ; Expansion of branch from %bb1 to %bb3 introduces need to expand ; branch from %bb0 to %bb2 -; GCN-LABEL: {{^}}expand_requires_expand: -; GCN-NEXT: ; %bb.0: ; %bb0 -; GCN: s_load_dword -; GCN: {{s|v}}_cmp_lt_i32 -; GCN: s_cbranch - -; GCN: s_load_dword -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 -; GCN-NEXT: s_cselect_b64 -; GCN: s_cbranch_vccz [[BB2:.LBB[0-9]_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: -; GCN-NEXT: s_getpc_b64 s[[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], ([[BB3:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], ([[BB3:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC1_LO]]:[[PC1_HI]]] - -; GCN-NEXT: [[BB2]]: ; %bb2 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: ;;#ASMEND - -; GCN-NEXT: [[BB3]]: ; %bb3 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: v_nop_e64 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { +; GCN-LABEL: expand_requires_expand: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s0, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccnz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; %bb1 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: .LBB7_2: ; %Flow +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccz .LBB7_3 +; GCN-NEXT: .LBB7_5: ; %Flow +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: .Lpost_getpc8: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: .LBB7_3: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; 
GCN-NEXT: .LBB7_4: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm bb0: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp slt i32 %cond0, 0 @@ -383,30 +440,36 @@ ; Requires expanding of required skip branch. -; GCN-LABEL: {{^}}uniform_inside_divergent: -; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} -; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_cbranch_execnz [[IF:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %entry -; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[BB2:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[BB2:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] - -; GCN-NEXT: [[IF]]: ; %if -; GCN: s_cmp_lg_u32 -; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: ; %bb.2: ; %if_uniform -; GCN: buffer_store_dword - -; GCN-NEXT: [[ENDIF]]: ; %endif -; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] -; GCN-NEXT: s_sleep 5 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) #0 { +; GCN-LABEL: uniform_inside_divergent: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execnz .LBB8_1 +; GCN-NEXT: .LBB8_4: ; %entry +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: .Lpost_getpc9: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB8_3-.Lpost_getpc9)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: .LBB8_1: ; %if +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_cbranch_scc1 .LBB8_3 +; GCN-NEXT: ; %bb.2: ; %if_uniform +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: .LBB8_3: ; %endif +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_sleep 5 +; GCN-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -430,37 +493,56 @@ ; si_mask_branch -; GCN-LABEL: {{^}}analyze_mask_branch: -; GCN: v_cmp_nlt_f32_e32 vcc -; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]] - -; GCN: .LBB{{[0-9]+_[0-9]+}}: ; %Flow1 -; GCN-NEXT: s_andn2_saveexec_b64 [[MASK]], [[MASK]] -; GCN-NEXT: s_cbranch_execnz - -; GCN: .L[[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}} -; GCN: ;;#ASMSTART -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: v_nop_e64 -; GCN: ;;#ASMEND -; GCN: s_cbranch_{{vccz|vccnz}} [[RET:.LBB[0-9]+_[0-9]+]] - -; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %loop -; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1 -; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.L[[LOOP_BODY]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOP_BODY]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 
s[[[PC_LO]]:[[PC_HI]]] - -; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock -; GCN-NEXT: s_endpgm define amdgpu_kernel void @analyze_mask_branch() #0 { +; GCN-LABEL: analyze_mask_branch: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_mov_b32_e64 v0, 0 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %ret +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB9_2: ; %Flow1 +; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GCN-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-NEXT: .LBB9_6: ; %Flow1 +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: .Lpost_getpc10: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: .LBB9_3: ; %loop.preheader +; GCN-NEXT: s_and_b64 vcc, exec, 0 +; GCN-NEXT: .LBB9_4: ; %loop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b64 vcc, vcc +; GCN-NEXT: s_cbranch_vccnz .LBB9_5 +; GCN-NEXT: .LBB9_8: ; %loop +; GCN-NEXT: ; in Loop: Header=BB9_4 Depth=1 +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: .Lpost_getpc11: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock +; GCN-NEXT: s_endpgm entry: %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() %cmp0 = fcmp ogt float %reg, 0.000000e+00 @@ -487,28 +569,71 @@ ret void } -; GCN-LABEL: {{^}}long_branch_hang: -; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 -; GCN: s_cbranch_scc{{[0-1]}} [[LONG_BR_0:.LBB[0-9]+_[0-9]+]] -; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: - -; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_BR_DEST0:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LONG_BR_DEST0]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: [[LONG_BR_0]]: - -; GCN: [[LONG_BR_DEST0]]: - -; GCN-DAG: s_cmp_lt_i32 -; GCN-DAG: s_cmp_ge_i32 - -; GCN: s_cbranch_vccz -; GCN: s_setpc_b64 - -; GCN: s_endpgm define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 { +; GCN-LABEL: long_branch_hang: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_cmp_lt_i32 s7, 6 +; GCN-NEXT: s_cbranch_scc1 .LBB10_1 +; GCN-NEXT: .LBB10_8: ; %bb +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: .Lpost_getpc12: +; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: .LBB10_1: ; %bb13 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: 
v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cbranch_execz .LBB10_3 +; GCN-NEXT: s_branch .LBB10_4 +; GCN-NEXT: .LBB10_2: +; GCN-NEXT: s_mov_b64 s[8:9], 0 +; GCN-NEXT: .LBB10_3: ; %bb9 +; GCN-NEXT: s_cmp_lt_i32 s7, 1 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_cmp_ge_i32 s6, s7 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9] +; GCN-NEXT: .LBB10_4: ; %Flow5 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GCN-NEXT: s_cbranch_vccz .LBB10_5 +; GCN-NEXT: .LBB10_10: ; %Flow5 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc13: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB10_6-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB10_5: ; %bb14 +; GCN-NEXT: s_cmp_lt_i32 s5, 9 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_cmp_lt_i32 s6, s7 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-NEXT: s_branch .LBB10_7 +; GCN-NEXT: .LBB10_6: +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB10_7: ; %bb19 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xf +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm bb: %tmp = icmp slt i32 %arg2, 9 %tmp6 = icmp eq i32 %arg1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1,48 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Disabled endcf collapse at -O0. 
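; At -O0 the saved exec masks are not kept live in SGPRs across blocks;
; they are spilled to lanes of a VGPR instead, so the GCN-O0 checks below
; wrap each structured-if save/restore in writelane/readlane pairs. A
; minimal sketch of the recurring shape, assuming the v1/lane-2/lane-3
; numbering seen in the first function (the real checks pin the VGPR and
; lane numbers per block):
;
;   s_mov_b64 s[0:1], exec
;   v_writelane_b32 v1, s0, 2        ; spill exec.lo to lane 2 of v1
;   v_writelane_b32 v1, s1, 3        ; spill exec.hi to lane 3 of v1
;   ...
;   v_readlane_b32 s0, v1, 2         ; reload at the join point
;   v_readlane_b32 s1, v1, 3
;   s_or_b64 exec, exec, s[0:1]      ; restore exec (the endcf)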
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s -; GCN-LABEL: {{^}}simple_nested_if: -; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]] -; GCN-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] -; GCN: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz [[ENDIF]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: {{^}}[[ENDIF]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]] -; GCN: ds_write_b32 -; GCN: s_endpgm -; -; GCN-O0-LABEL: {{^}}simple_nested_if: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; GCN-O0-NEXT: {{^}}[[ENDIF_INNER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0: ds_write_b32 -; GCN-O0: s_endpgm -; define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { +; GCN-LABEL: simple_nested_if: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_3 +; GCN-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_and_b64 exec, exec, vcc +; GCN-NEXT: s_cbranch_execz .LBB0_3 +; GCN-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 +; GCN-NEXT: .LBB0_3: ; %bb.outer.end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 3 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: simple_nested_if: +; GCN-O0: ; %bb.0: ; %bb +; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s10, -1 +; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-O0-NEXT: s_add_u32 s8, s8, s3 +; 
GCN-O0-NEXT: s_addc_u32 s9, s9, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s0, 1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 +; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s0, 0 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB0_3 +; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: .LBB0_3: ; %Flow +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: s_mov_b32 m0, -1 +; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = icmp ugt i32 %tmp, 1 @@ -65,53 +142,151 @@ ret void } -; GCN-LABEL: {{^}}uncollapsable_nested_if: -; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] -; 
GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]] -; GCN-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: {{^}}[[ENDIF_INNER]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]] -; GCN: store_dword -; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]] -; GCN: ds_write_b32 -; GCN: s_endpgm -; -; GCN-O0-LABEL: {{^}}uncollapsable_nested_if: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; GCN-O0-NEXT: s_branch [[ENDIF_INNER]] -; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_branch [[LAST_BB:.LBB[0-9_]+]] -; GCN-O0-NEXT: {{^}}[[ENDIF_INNER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0: s_branch [[ENDIF_OUTER]] -; GCN-O0-NEXT: {{^}}[[LAST_BB]]: -; GCN-O0: ds_write_b32 -; GCN-O0: s_endpgm -; define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) { +; GCN-LABEL: uncollapsable_nested_if: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_3 +; GCN-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4 +; GCN-NEXT: .LBB1_3: ; %bb.inner.end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 2 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: .LBB1_4: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 3 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v1, 
v0 +; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: uncollapsable_nested_if: +; GCN-O0: ; %bb.0: ; %bb +; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s10, -1 +; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-O0-NEXT: s_add_u32 s8, s8, s3 +; GCN-O0-NEXT: s_addc_u32 s9, s9, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s0, 1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB1_3 +; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s0, 0 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 +; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_branch .LBB1_4 +; GCN-O0-NEXT: .LBB1_3: ; %Flow +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_branch .LBB1_5 +; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4 +; 
GCN-O0-NEXT: v_readlane_b32 s3, v1, 5 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_branch .LBB1_3 +; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: s_mov_b32 m0, -1 +; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = icmp ugt i32 %tmp, 1 @@ -140,65 +315,170 @@ ret void } -; GCN-LABEL: {{^}}nested_if_if_else: -; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] -; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]] -; GCN-NEXT: s_xor_b64 [[SAVEEXEC_INNER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_INNER]] -; GCN-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN: {{^}}[[THEN_INNER]]: -; GCN-NEXT: s_andn2_saveexec_b64 [[SAVEEXEC_INNER2]], [[SAVEEXEC_INNER2]] -; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]] -; GCN: store_dword -; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]] -; GCN: ds_write_b32 -; GCN: s_endpgm -; -; GCN-O0-LABEL: {{^}}nested_if_if_else: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]] -; GCN-O0-NEXT: s_branch [[TEMP_BB:.LBB[0-9_]+]] -; GCN-O0-NEXT: {{^}}[[THEN_INNER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[THEN_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[THEN_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; 
GCN-O0-NEXT: s_branch [[ENDIF_INNER]] -; GCN-O0-NEXT: {{^}}[[TEMP_BB]]: -; GCN-O0: s_branch [[THEN_INNER]] -; GCN-O0-NEXT: {{^}}[[ENDIF_INNER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0: ds_write_b32 -; GCN-O0: s_endpgm -; define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { +; GCN-LABEL: nested_if_if_else: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: ; %bb.2: ; %bb.else +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mov_b32_e32 v0, 2 +; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GCN-NEXT: .LBB2_3: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: ; %bb.4: ; %bb.then +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: .LBB2_5: ; %bb.outer.end +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 3 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v2, v0 +; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: nested_if_if_else: +; GCN-O0: ; %bb.0: ; %bb +; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s10, -1 +; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-O0-NEXT: s_add_u32 s8, s8, s3 +; GCN-O0-NEXT: s_addc_u32 s9, s9, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: 
v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s0, 1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 +; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: v_writelane_b32 v1, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s3, 5 +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB2_2 +; GCN-O0-NEXT: s_branch .LBB2_4 +; GCN-O0-NEXT: .LBB2_2: ; %Flow +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7 +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 +; GCN-O0-NEXT: ; %bb.3: ; %bb.then +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_branch .LBB2_5 +; GCN-O0-NEXT: .LBB2_4: ; %bb.else +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_branch .LBB2_2 +; GCN-O0-NEXT: .LBB2_5: ; %Flow1 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 
6 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: s_mov_b32 m0, -1 +; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp @@ -227,88 +507,225 @@ ret void } -; GCN-LABEL: {{^}}nested_if_else_if: -; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] -; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]] -; GCN-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]] -; GCN-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]] -; GCN: {{^}}[[THEN_OUTER]]: -; GCN-NEXT: s_andn2_saveexec_b64 [[SAVEEXEC_OUTER2]], [[SAVEEXEC_OUTER2]] -; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_ELSE:s\[[0-9:]+\]]], -; GCN-NEXT: s_cbranch_execz [[FLOW1:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: [[FLOW1]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_ELSE]] -; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER2]] -; GCN: ds_write_b32 -; GCN: s_endpgm -; -; GCN-O0-LABEL: {{^}}nested_if_else_if: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]] -; GCN-O0-NEXT: s_branch [[INNER_IF_OUTER_ELSE:.LBB[0-9_]+]] -; GCN-O0-NEXT: {{^}}[[THEN_OUTER]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[FLOW1:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; GCN-O0-NEXT: s_branch [[FLOW1]] -; GCN-O0-NEXT: {{^}}[[INNER_IF_OUTER_ELSE]] -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: 
v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; GCN-O0-NEXT: {{^}}[[THEN_OUTER_FLOW]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_IF_OUTER_ELSE_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_IF_OUTER_ELSE_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_branch [[THEN_OUTER]] -; GCN-O0-NEXT: {{^}}[[FLOW1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[ELSE_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[ELSE_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_2_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_2_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0: ds_write_b32 -; GCN-O0: s_endpgm -; define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { +; GCN-LABEL: nested_if_else_if: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0 +; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB3_4 +; GCN-NEXT: ; %bb.1: ; %bb.outer.else +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: v_mov_b32_e32 v3, 3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_cbranch_execz .LBB3_3 +; GCN-NEXT: ; %bb.2: ; %bb.inner.then2 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: .LBB3_3: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB3_4: ; %Flow2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_8 +; GCN-NEXT: ; %bb.5: ; %bb.outer.then +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, 1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB3_7 +; GCN-NEXT: ; %bb.6: ; %bb.inner.then +; GCN-NEXT: v_mov_b32_e32 v0, 2 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: .LBB3_7: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB3_8: ; %bb.outer.end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 3 +; 
GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: nested_if_else_if: +; GCN-O0: ; %bb.0: ; %bb +; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s10, -1 +; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-O0-NEXT: s_add_u32 s8, s8, s3 +; GCN-O0-NEXT: s_addc_u32 s9, s9, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s0, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: s_mov_b32 s1, s0 +; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v4 +; GCN-O0-NEXT: v_add_i32_e64 v5, s[2:3], s2, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] +; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 +; GCN-O0-NEXT: s_branch .LBB3_4 +; GCN-O0-NEXT: .LBB3_1: ; %Flow2 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 +; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: v_mov_b32_e32 v2, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4 +; 
GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 +; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8 +; GCN-O0-NEXT: s_branch .LBB3_7 +; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else +; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:12 +; GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 +; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:16 +; GCN-O0-NEXT: .LBB3_6: ; %Flow +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_branch .LBB3_1 +; GCN-O0-NEXT: .LBB3_7: ; %Flow1 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 +; 
GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: s_mov_b32 m0, -1 +; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp @@ -343,33 +760,74 @@ ret void } -; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier: -; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]] -; GCN-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] -; GCN-NEXT: ; %bb.{{[0-9]+}}: -; GCN: store_dword -; GCN-NEXT: {{^}}[[ENDIF]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]] -; GCN: s_barrier -; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: {{^}}s_endpgm_unsafe_barrier: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: store_dword -; GCN-O0-NEXT: {{^}}[[ENDIF]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0: s_barrier -; GCN-O0: s_endpgm -; define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) { +; GCN-LABEL: s_endpgm_unsafe_barrier: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %bb.then +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: .LBB4_2: ; %bb.end +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_barrier +; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: s_endpgm_unsafe_barrier: +; GCN-O0: ; %bb.0: ; %bb +; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s10, -1 +; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-O0-NEXT: s_add_u32 s8, s8, s3 +; GCN-O0-NEXT: s_addc_u32 s9, s9, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s0, 1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 +; GCN-O0-NEXT: ; %bb.1: ; %bb.then +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; 
GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_ashrrev_i32_e64 v0, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: .LBB4_2: ; %bb.end +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: s_barrier +; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = icmp ugt i32 %tmp, 1 @@ -385,103 +843,320 @@ ret void } -; GCN-LABEL: {{^}}scc_liveness: - -; GCN: [[BB1_OUTER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] -; -; GCN: [[BB1_INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}} -; GCN: s_andn2_b64 -; GCN-NEXT: s_cbranch_execz - -; GCN: [[BB1_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN: s_andn2_b64 exec, exec, -; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]] - -; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offen - -; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER]], {{vcc|s\[[0-9:]+\]}} -; GCN-NEXT: s_cbranch_execz [[BB1_OUTER_LOOP]] - -; GCN-NOT: s_or_b64 exec, exec - -; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}} -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: s_setpc_b64 -; -; GCN-O0-LABEL: {{^}}scc_liveness: -; GCN-O0-COUNT-2: buffer_store_dword -; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN-O0: buffer_load_dword -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-NEXT: s_mov_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] -; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0-DAG: v_readlane_b32 
s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[FLOW2:.LBB[0-9_]+]] -; GCN-O0: {{^}}[[FLOW2]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] -; GCN-O0: s_branch [[FLOW:.LBB[0-9_]+]] -; GCN-O0: {{^}}[[FLOW]]: -; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execz [[FLOW3:.LBB[0-9_]+]] -; GCN-O0: ; %bb.{{[0-9]+}}: -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0: {{^}}[[FLOW3]]: -; GCN-O0-COUNT-4: buffer_load_dword -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] -; GCN-O0: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-COUNT-2: s_mov_b64 -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-COUNT-4: buffer_store_dword -; GCN-O0: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] -; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] -; GCN-O0: ; %bb.{{[0-9]+}}: -; GCN-O0-COUNT-4: buffer_store_dword -; GCN-O0: s_setpc_b64 -; define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { +; GCN-LABEL: scc_liveness: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_movk_i32 s4, 0x207 +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: s_mov_b64 s[12:13], 0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_branch .LBB5_3 +; GCN-NEXT: .LBB5_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: .LBB5_2: ; %bb10 +; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[14:15] +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN-NEXT: s_cbranch_execz .LBB5_7 +; GCN-NEXT: .LBB5_3: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_and_b64 s[10:11], exec, vcc +; GCN-NEXT: s_or_b64 
s[6:7], s[10:11], s[6:7] +; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_cbranch_execnz .LBB5_3 +; GCN-NEXT: ; %bb.4: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_mov_b32 s10, s8 +; GCN-NEXT: s_mov_b32 s11, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: s_and_saveexec_b64 s[14:15], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.5: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB5_1 +; GCN-NEXT: ; %bb.6: ; %bb8 +; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: s_branch .LBB5_1 +; GCN-NEXT: .LBB5_7: ; %bb12 +; GCN-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: scc_liveness: +; GCN-O0: ; %bb.0: ; %bb +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 3 +; GCN-O0-NEXT: .LBB5_1: ; %bb1 +; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s8, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s9, v1, 3 +; GCN-O0-NEXT: v_readlane_b32 s6, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s7, v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 5 +; GCN-O0-NEXT: s_mov_b32 s4, 0x207 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4 +; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 6 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 7 +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-O0-NEXT: ; %bb.2: ; %bb2 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 7 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 8 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 9 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s8, s4 +; GCN-O0-NEXT: s_mov_b32 s9, s4 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 10 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 11 +; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 +; GCN-O0-NEXT: ; %bb.3: ; %bb4 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4 +; GCN-O0-NEXT: s_mov_b32 s8, s4 +; GCN-O0-NEXT: s_mov_b32 s9, s4 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 12 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 13 +; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 +; GCN-O0-NEXT: ; %bb.4: ; %bb8 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_mov_b32 s10, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: ; implicit-def: $sgpr5 +; GCN-O0-NEXT: ; implicit-def: $sgpr9 +; GCN-O0-NEXT: ; implicit-def: $sgpr5 +; GCN-O0-NEXT: ; implicit-def: $sgpr8 +; GCN-O0-NEXT: ; implicit-def: $sgpr5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s5, s10 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_branch .LBB5_6 +; GCN-O0-NEXT: .LBB5_5: ; %Flow2 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_branch .LBB5_7 +; GCN-O0-NEXT: .LBB5_6: ; %Flow +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_branch .LBB5_5 +; GCN-O0-NEXT: .LBB5_7: ; %bb10 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: v_readlane_b32 s6, v1, 8 +; GCN-O0-NEXT: v_readlane_b32 s7, v1, 9 +; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 16 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 17 +; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 +; GCN-O0-NEXT: ; %bb.8: ; %Flow1 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 +; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 +; GCN-O0-NEXT: .LBB5_9: ; %Flow3 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s8, v1, 16 +; GCN-O0-NEXT: v_readlane_b32 s9, v1, 17 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 15 +; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v1, s8, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s9, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 18 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 19 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-O0-NEXT: ; %bb.10: ; %bb12 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 18 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 19 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: ; %bb.11: ; %bb12 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 +; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 +; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v3 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 +; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] bb: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -1,18 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -structurizecfg-skip-uniform-regions -verify-machineinstrs < %s | 
FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -structurizecfg-skip-uniform-regions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s -; GCN-LABEL: {{^}}uniform_if_scc: -; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; GCN: s_cbranch_scc1 [[IF_LABEL:.L[0-9_A-Za-z]+]] - -; Fall-through to the else -; GCN: s_mov_b32 [[S_VAL]], 1 - -; GCN: [[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] -; GCN: buffer_store_dword [[V_VAL]] define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { +; SI-LABEL: uniform_if_scc: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s2, 0 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_cbranch_scc1 .LBB0_2 +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: .LBB0_2: ; %done +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_scc: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB0_2 +; VI-NEXT: ; %bb.1: ; %else +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: .LBB0_2: ; %done +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %else @@ -29,18 +54,44 @@ ret void } -; GCN-LABEL: {{^}}uniform_if_vcc: -; GCN-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} -; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; GCN: s_cbranch_vccnz [[IF_LABEL:.L[0-9_A-Za-z]+]] - -; Fall-through to the else -; GCN: s_mov_b32 [[S_VAL]], 1 - -; GCN: [[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] -; GCN: buffer_store_dword [[V_VAL]] define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) { +; SI-LABEL: uniform_if_vcc: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB1_2 +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: .LBB1_2: ; %done +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_vcc: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s3, s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %else +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: .LBB1_2: ; %done +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: 
s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %if, label %else @@ -57,18 +108,42 @@ ret void } -; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc: -; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; GCN: s_cbranch_scc1 [[IF_LABEL:.L[0-9_A-Za-z]+]] - -; Fall-through to the else -; GCN: s_mov_b32 [[S_VAL]], 1 - -; GCN: [[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] -; GCN: buffer_store_dword [[V_VAL]] define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspace(1) %out) { +; SI-LABEL: uniform_if_swap_br_targets_scc: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_cbranch_scc1 .LBB2_2 +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: .LBB2_2: ; %done +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_swap_br_targets_scc: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB2_2 +; VI-NEXT: ; %bb.1: ; %else +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: .LBB2_2: ; %done +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %else, label %if @@ -85,18 +160,44 @@ ret void } -; GCN-LABEL: {{^}}uniform_if_swap_br_targets_vcc: -; GCN-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} -; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; GCN: s_cbranch_vccnz [[IF_LABEL:.L[0-9_A-Za-z]+]] - -; Fall-through to the else -; GCN: s_mov_b32 [[S_VAL]], 1 - -; GCN: [[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] -; GCN: buffer_store_dword [[V_VAL]] define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrspace(1) %out) { +; SI-LABEL: uniform_if_swap_br_targets_vcc: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB3_2 +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: .LBB3_2: ; %done +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_swap_br_targets_vcc: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s3, s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %else +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: .LBB3_2: ; %done +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; 
VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %else, label %if @@ -113,16 +214,44 @@ ret void } -; GCN-LABEL: {{^}}uniform_if_move_valu: -; GCN: v_add_f32_e32 [[CMP:v[0-9]+]] ; Using a floating-point value in an integer compare will cause the compare to ; be selected for the SALU and then later moved to the VALU. -; GCN: v_cmp_ne_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]] -; GCN: s_cbranch_vccnz [[ENDIF_LABEL:.L[0-9_A-Za-z]+]] -; GCN: buffer_store_dword -; GCN: [[ENDIF_LABEL]]: -; GCN: s_endpgm define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) { +; SI-LABEL: uniform_if_move_valu: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %if +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: .LBB4_2: ; %endif +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_move_valu: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; VI-NEXT: s_cbranch_vccnz .LBB4_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: .LBB4_2: ; %endif +; VI-NEXT: s_endpgm entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -137,16 +266,44 @@ ret void } -; GCN-LABEL: {{^}}uniform_if_move_valu_commute: -; GCN: v_add_f32_e32 [[CMP:v[0-9]+]] ; Using a floating-point value in an integer compare will cause the compare to ; be selected for the SALU and then later moved to the VALU. 
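; For instance, in the checks below the fadd selects to v_add_f32_e32 and
; so defines a VGPR; an integer compare of its bitcast, e.g.
;   %cond = bitcast float %a.0 to i32
;   %cmp = icmp ugt i32 %cond, 5   ; the icmp is an illustrative sketch
; cannot keep its SALU form and ends up as v_cmp_gt_u32 writing vcc, so
; the uniform branch is emitted as s_cbranch_vccnz.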
-; GCN: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]] -; GCN: s_cbranch_vccnz [[ENDIF_LABEL:.L[0-9_A-Za-z]+]] -; GCN: buffer_store_dword -; GCN: [[ENDIF_LABEL]]: -; GCN: s_endpgm define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, float %a) { +; SI-LABEL: uniform_if_move_valu_commute: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 +; SI-NEXT: s_cbranch_vccnz .LBB5_2 +; SI-NEXT: ; %bb.1: ; %if +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: .LBB5_2: ; %endif +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_move_valu_commute: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 +; VI-NEXT: s_cbranch_vccnz .LBB5_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: .LBB5_2: ; %endif +; VI-NEXT: s_endpgm entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -162,19 +319,44 @@ } -; GCN-LABEL: {{^}}uniform_if_else_ret: -; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_LABEL:.L[0-9_A-Za-z]+]] - -; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN: buffer_store_dword [[TWO]] -; GCN: s_endpgm - -; GCN: {{^}}[[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: buffer_store_dword [[ONE]] -; GCN: s_endpgm define amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out, i32 %a) { +; SI-LABEL: uniform_if_else_ret: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cbranch_scc0 .LBB6_2 +; SI-NEXT: ; %bb.1: ; %if.else +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB6_2: ; %if.then +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_else_ret: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc0 .LBB6_2 +; VI-NEXT: ; %bb.1: ; %if.else +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB6_2: ; %if.then +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -191,23 +373,59 @@ ret void } -; GCN-LABEL: {{^}}uniform_if_else: -; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_LABEL:.L[0-9_A-Za-z]+]] - -; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2 -; GCN: s_branch 
[[ENDIF_LABEL:.L[0-9_A-Za-z]+]] - -; GCN: [[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[IMM_REG]], 1 - -; GCN-NEXT: [[ENDIF_LABEL]]: -; GCN: buffer_store_dword [[IMM_REG]] - -; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 -; GCN: buffer_store_dword [[THREE]] -; GCN: s_endpgm define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr addrspace(1) nocapture %out1, i32 %a) { +; SI-LABEL: uniform_if_else: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_2 +; SI-NEXT: ; %bb.1: ; %if.else +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, 2 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %if.then +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: .LBB7_3: ; %if.end +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_if_else: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_2 +; VI-NEXT: ; %bb.1: ; %if.else +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, 2 +; VI-NEXT: s_branch .LBB7_3 +; VI-NEXT: .LBB7_2: ; %if.then +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: .LBB7_3: ; %if.end +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -225,13 +443,42 @@ ret void } -; GCN-LABEL: {{^}}icmp_2_users: -; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; GCN: s_cbranch_scc1 [[LABEL:.L[0-9_A-Za-z]+]] -; GCN: buffer_store_dword -; GCN: [[LABEL]]: -; GCN: s_endpgm define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { +; SI-LABEL: icmp_2_users: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_gt_i32 s4, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB8_2 +; SI-NEXT: ; %bb.1: ; %IF +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: .LBB8_2: ; %ENDIF +; SI-NEXT: s_endpgm +; +; VI-LABEL: icmp_2_users: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_gt_i32 s4, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_lt_i32 s4, 1 +; VI-NEXT: s_cbranch_scc1 .LBB8_2 +; VI-NEXT: ; %bb.1: ; %IF +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; 
VI-NEXT: .LBB8_2: ; %ENDIF +; VI-NEXT: s_endpgm main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -245,18 +492,52 @@ ret void } -; GCN-LABEL: {{^}}icmp_users_different_blocks: -; GCN: s_load_dwordx2 s[[[COND0:[0-9]+]]:[[COND1:[0-9]+]]] -; GCN: s_cmp_lt_i32 s[[COND0]], 1 -; GCN: s_cbranch_scc1 [[EXIT:.L[0-9_A-Za-z]+]] -; GCN: s_cmp_gt_i32 s[[COND1]], 0{{$}} -; GCN: s_cbranch_vccz [[BODY:.L[0-9_A-Za-z]+]] -; GCN: {{^}}[[EXIT]]: -; GCN: s_endpgm -; GCN: {{^}}[[BODY]]: -; GCN: buffer_store -; GCN: s_endpgm define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, ptr addrspace(1) %out) { +; SI-LABEL: icmp_users_different_blocks: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lt_i32 s2, 1 +; SI-NEXT: s_cbranch_scc1 .LBB9_2 +; SI-NEXT: ; %bb.1: ; %bb2 +; SI-NEXT: s_cmp_gt_i32 s3, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %bb9 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB9_3: ; %bb7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: icmp_users_different_blocks: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lt_i32 s2, 1 +; VI-NEXT: s_cbranch_scc1 .LBB9_2 +; VI-NEXT: ; %bb.1: ; %bb2 +; VI-NEXT: s_cmp_gt_i32 s3, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %bb9 +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB9_3: ; %bb7 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp sgt i32 %cond0, 0 @@ -276,13 +557,30 @@ ret void } -; SI-LABEL: {{^}}uniform_loop: -; SI: {{^}}[[LOOP_LABEL:.L[0-9_A-Za-z]+]]: -; SI: s_add_i32 [[I:s[0-9]+]], s{{[0-9]+}}, -1 -; SI: s_cmp_lg_u32 [[I]], 0 -; SI: s_cbranch_scc1 [[LOOP_LABEL]] -; SI: s_endpgm define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { +; SI-LABEL: uniform_loop: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: .LBB10_1: ; %loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_i32 s0, s0, -1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cbranch_scc1 .LBB10_1 +; SI-NEXT: ; %bb.2: ; %done +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_loop: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: .LBB10_1: ; %loop +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s0, -1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_1 +; VI-NEXT: ; %bb.2: ; %done +; VI-NEXT: s_endpgm entry: br label %loop @@ -298,16 +596,51 @@ ; Test uniform and divergent. 
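; In uniform_inside_divergent below, the outer branch depends on the
; workitem id, so it is divergent and is lowered through an exec-mask
; update (v_cmp_gt_u32 feeding s_and_saveexec_b64), while the inner branch
; depends only on the kernel argument %cond, so it stays on the scalar
; unit as s_cmp_lg_u32 followed by s_cbranch_scc0.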
-; GCN-LABEL: {{^}}uniform_inside_divergent: -; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} -; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:.L[0-9_A-Za-z]+]] -; GCN: s_endpgm -; GCN: {{^}}[[IF_UNIFORM_LABEL]]: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: buffer_store_dword [[ONE]] define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) { +; SI-LABEL: uniform_inside_divergent: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_cbranch_execz .LBB11_2 +; SI-NEXT: ; %bb.1: ; %if +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: .LBB11_2: ; %endif +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB11_3: ; %if_uniform +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uniform_inside_divergent: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: .LBB11_2: ; %endif +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB11_3: ; %if_uniform +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp ult i32 %tid, 16 @@ -326,18 +659,52 @@ ret void } -; GCN-LABEL: {{^}}divergent_inside_uniform: -; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_LABEL:.L[0-9_A-Za-z]+]] -; GCN: [[ENDIF_LABEL:.L[0-9_A-Za-z]+]]: -; GCN: [[IF_LABEL]]: -; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} -; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: s_cbranch_execz [[ENDIF_LABEL]] -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: buffer_store_dword [[ONE]] -; GCN: s_endpgm define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %cond) { +; SI-LABEL: divergent_inside_uniform: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cbranch_scc0 .LBB12_2 +; SI-NEXT: .LBB12_1: ; %endif +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB12_2: ; %if +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_cbranch_execz .LBB12_1 +; SI-NEXT: ; %bb.3: ; %if_uniform +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: divergent_inside_uniform: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s2, 0 
+; VI-NEXT: s_cbranch_scc0 .LBB12_2 +; VI-NEXT: .LBB12_1: ; %endif +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB12_2: ; %if +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB12_1 +; VI-NEXT: ; %bb.3: ; %if_uniform +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm entry: %u_cmp = icmp eq i32 %cond, 0 br i1 %u_cmp, label %if, label %endif @@ -356,19 +723,61 @@ ret void } -; GCN-LABEL: {{^}}divergent_if_uniform_if: -; GCN: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: buffer_store_dword [[ONE]] -; GCN: s_or_b64 exec, exec, [[MASK]] -; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_UNIFORM:.L[0-9_A-Za-z]+]] -; GCN: s_endpgm -; GCN: [[IF_UNIFORM]]: -; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN: buffer_store_dword [[TWO]] define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %cond) { +; SI-LABEL: divergent_if_uniform_if: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_cbranch_execz .LBB13_2 +; SI-NEXT: ; %bb.1: ; %if +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: .LBB13_2: ; %endif +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.3: ; %exit +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB13_4: ; %if_uniform +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: divergent_if_uniform_if: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: .LBB13_2: ; %endif +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.3: ; %exit +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB13_4: ; %if_uniform +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, 2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp eq i32 %tid, 0 @@ -395,20 +804,50 @@ ; the first, leaving an scc use in a different block than it was ; defed. 
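; This is visible in the checks that follow: %bb emits the only
; s_cmp_lt_i32, and the s_cbranch_scc1 at the end of %bb2 reuses that scc
; result without a second compare being emitted in between.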

-; GCN-LABEL: {{^}}cse_uniform_condition_different_blocks:
-; GCN: s_load_dword [[COND:s[0-9]+]]
-; GCN: s_cmp_lt_i32 [[COND]], 1
-; GCN: s_cbranch_scc1 .LBB[[FNNUM:[0-9]+]]_3
-
-; GCN: %bb.1:
-; GCN-NOT: cmp
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: s_cbranch_scc1 .LBB[[FNNUM]]_3
-
-; GCN: .LBB[[FNNUM]]_3:
-; GCN: s_endpgm
 define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr addrspace(1) %out) {
+; SI-LABEL: cse_uniform_condition_different_blocks:
+; SI: ; %bb.0: ; %bb
+; SI-NEXT: s_load_dword s2, s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_lt_i32 s2, 1
+; SI-NEXT: s_cbranch_scc1 .LBB14_3
+; SI-NEXT: ; %bb.1: ; %bb2
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_cbranch_scc1 .LBB14_3
+; SI-NEXT: ; %bb.2: ; %bb7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: .LBB14_3: ; %bb9
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: cse_uniform_condition_different_blocks:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_load_dword s2, s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lt_i32 s2, 1
+; VI-NEXT: s_cbranch_scc1 .LBB14_3
+; VI-NEXT: ; %bb.1: ; %bb2
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_cbranch_scc1 .LBB14_3
+; VI-NEXT: ; %bb.2: ; %bb7
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: .LBB14_3: ; %bb9
+; VI-NEXT: s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp sgt i32 %cond, 0
@@ -428,21 +867,44 @@
   ret void
 }


-; GCN-LABEL: {{^}}uniform_if_scc_i64_eq:
-; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
-; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
-; SI-DAG: v_cmp_eq_u64_e64
-; SI: s_cbranch_vccnz [[IF_LABEL:.L[0-9_A-Za-z]+]]
-
-; VI: s_cbranch_scc1 [[IF_LABEL:.L[0-9_A-Za-z]+]]
- ; Fall-through to the else
-; GCN: s_mov_b32 [[S_VAL]], 1
-
-; GCN: [[IF_LABEL]]:
-; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
-; GCN: buffer_store_dword [[V_VAL]]
 define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %out) {
+; SI-LABEL: uniform_if_scc_i64_eq:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB15_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_mov_b32 s0, 1
+; SI-NEXT: .LBB15_2: ; %done
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: uniform_if_scc_i64_eq:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_eq_u64 s[0:1], 0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB15_2
+; VI-NEXT: ; %bb.1: ; %else
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: .LBB15_2: ; %done
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
 entry:
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -459,22 +921,44 @@
   ret void
 }


-; GCN-LABEL: {{^}}uniform_if_scc_i64_ne:
-; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
-; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
-
-; SI-DAG: v_cmp_ne_u64_e64
-; SI: s_cbranch_vccnz [[IF_LABEL:.L[0-9_A-Za-z]+]]
-
-; VI: s_cbranch_scc1 [[IF_LABEL:.L[0-9_A-Za-z]+]]
- ; Fall-through to the else
-; GCN: s_mov_b32 [[S_VAL]], 1
-
-; GCN: [[IF_LABEL]]:
-; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
-; GCN: buffer_store_dword [[V_VAL]]
 define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %out) {
+; SI-LABEL: uniform_if_scc_i64_ne:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB16_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_mov_b32 s0, 1
+; SI-NEXT: .LBB16_2: ; %done
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: uniform_if_scc_i64_ne:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB16_2
+; VI-NEXT: ; %bb.1: ; %else
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: .LBB16_2: ; %done
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
 entry:
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -491,18 +975,45 @@
   ret void
 }


-; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt:
-; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
-; GCN-DAG: v_cmp_gt_i64_e64
-; GCN: s_cbranch_vccnz [[IF_LABEL:.L[0-9_A-Za-z]+]]
- ; Fall-through to the else
-; GCN: s_mov_b32 [[S_VAL]], 1
-
-; GCN: [[IF_LABEL]]:
-; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
-; GCN: buffer_store_dword [[V_VAL]]
 define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %out) {
+; SI-LABEL: uniform_if_scc_i64_sgt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB17_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_mov_b32 s0, 1
+; SI-NEXT: .LBB17_2: ; %done
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: uniform_if_scc_i64_sgt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB17_2
+; VI-NEXT: ; %bb.1: ; %else
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: .LBB17_2: ; %done
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
 entry:
   %cmp0 = icmp sgt i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -519,9 +1030,44 @@
   ret void
 }

-; GCN-LABEL: {{^}}move_to_valu_i64_eq:
-; GCN: v_cmp_eq_u64_e32
 define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) {
+; SI-LABEL: move_to_valu_i64_eq:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SI-NEXT: s_cbranch_vccnz .LBB18_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_mov_b32 s2, 1
+; SI-NEXT: .LBB18_2: ; %done
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: move_to_valu_i64_eq:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_read_b64 v[0:1], v0
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT: s_cbranch_vccnz .LBB18_2
+; VI-NEXT: ; %bb.1: ; %else
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: .LBB18_2: ; %done
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
   %cond = load volatile i64, ptr addrspace(3) undef
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -538,9 +1084,44 @@
   ret void
 }

-; GCN-LABEL: {{^}}move_to_valu_i64_ne:
-; GCN: v_cmp_ne_u64_e32
 define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) {
+; SI-LABEL: move_to_valu_i64_ne:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SI-NEXT: s_cbranch_vccnz .LBB19_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_mov_b32 s2, 1
+; SI-NEXT: .LBB19_2: ; %done
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: move_to_valu_i64_ne:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_read_b64 v[0:1], v0
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT: s_cbranch_vccnz .LBB19_2
+; VI-NEXT: ; %bb.1: ; %else
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: .LBB19_2: ; %done
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
   %cond = load volatile i64, ptr addrspace(3) undef
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -557,10 +1138,52 @@
   ret void
 }

-; GCN-LABEL: {{^}}move_to_valu_vgpr_operand_phi:
-; GCN: v_add_{{[iu]}}32_e32
-; GCN: ds_write_b32
 define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
+; SI-LABEL: move_to_valu_vgpr_operand_phi:
+; SI: ; %bb.0: ; %bb0
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 28, v0
+; SI-NEXT: v_mov_b32_e32 v1, 1
+; SI-NEXT: s_and_b64 vcc, exec, 0
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: s_branch .LBB20_2
+; SI-NEXT: .LBB20_1: ; %bb3
+; SI-NEXT: ; in Loop: Header=BB20_2 Depth=1
+; SI-NEXT: v_add_i32_e64 v0, s[4:5], 8, v0
+; SI-NEXT: .LBB20_2: ; %bb1
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_cbranch_scc1 .LBB20_1
+; SI-NEXT: ; %bb.3: ; %bb2
+; SI-NEXT: ; in Loop: Header=BB20_2 Depth=1
+; SI-NEXT: ds_write_b32 v0, v1
+; SI-NEXT: s_mov_b64 vcc, vcc
+; SI-NEXT: s_cbranch_vccz .LBB20_1
+; SI-NEXT: ; %bb.4: ; %DummyReturnBlock
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: move_to_valu_vgpr_operand_phi:
+; VI: ; %bb.0: ; %bb0
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 28, v0
+; VI-NEXT: v_mov_b32_e32 v1, 1
+; VI-NEXT: s_and_b64 vcc, exec, 0
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_branch .LBB20_2
+; VI-NEXT: .LBB20_1: ; %bb3
+; VI-NEXT: ; in Loop: Header=BB20_2 Depth=1
+; VI-NEXT: v_add_u32_e64 v0, s[4:5], 8, v0
+; VI-NEXT: .LBB20_2: ; %bb1
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_cbranch_scc1 .LBB20_1
+; VI-NEXT: ; %bb.3: ; %bb2
+; VI-NEXT: ; in Loop: Header=BB20_2 Depth=1
+; VI-NEXT: ds_write_b32 v0, v1
+; VI-NEXT: s_mov_b64 vcc, vcc
+; VI-NEXT: s_cbranch_vccz .LBB20_1
+; VI-NEXT: ; %bb.4: ; %DummyReturnBlock
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
 bb0:
   br label %bb1
@@ -582,3 +1205,5 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #0

 attributes #0 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}