Index: llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,25 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}i1_copy_from_loop:
-;
-; SI: ; %Flow
-; SI: s_or_b64 [[EXIT_MASK:s\[[0-9]+:[0-9]+\]]]
-; SI: s_and_b64 [[ACCUM_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_MASK:s\[[0-9]+:[0-9]+\]]], exec
-; SI: s_or_b64 [[I1_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[ACCUM_MASK]]
-; SI: s_cbranch_execz [[FOR_END_LABEL:BB0_[0-9]+]]
-
-; SI: ; %for.body
-; SI: v_cmp_lt_u32_e64 [[CC_MASK]], s{{[0-9]+}}, 4
-
-; SI: [[FOR_END_LABEL]]
-; SI: s_or_b64 exec, exec, [[EXIT_MASK]]
-; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[I1_VALUE]]
-; SI: s_cbranch_execz [[EXIT:BB0_[0-9]+]]
-; SI: [[EXIT]]
-; SI-NEXT: s_endpgm
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
+; SI-LABEL: i1_copy_from_loop:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: ; implicit-def: $sgpr8_sgpr9
+; SI-NEXT: ; implicit-def: $sgpr10_sgpr11
+; SI-NEXT: s_branch BB0_3
+; SI-NEXT: BB0_1: ; %Flow1
+; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; SI-NEXT: s_or_b64 exec, exec, s[14:15]
+; SI-NEXT: BB0_2: ; %Flow
+; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; SI-NEXT: s_and_b64 s[14:15], exec, s[10:11]
+; SI-NEXT: s_or_b64 s[4:5], s[14:15], s[4:5]
+; SI-NEXT: s_andn2_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec
+; SI-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execz BB0_6
+; SI-NEXT: BB0_3: ; %for.body
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_or_b64 s[10:11], s[10:11], exec
+; SI-NEXT: s_cmp_gt_u32 s6, 3
+; SI-NEXT: v_cmp_lt_u32_e64 s[12:13], s6, 4
+; SI-NEXT: s_cbranch_scc1 BB0_2
+; SI-NEXT: ; %bb.4: ; %mid.loop
+; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen
+; SI-NEXT: s_mov_b64 s[12:13], -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
+; SI-NEXT: s_mov_b64 s[10:11], -1
+; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT: s_cbranch_execz BB0_1
+; SI-NEXT: ; %bb.5: ; %end.loop
+; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; SI-NEXT: s_add_i32 s6, s6, 1
+; SI-NEXT: s_xor_b64 s[10:11], exec, -1
+; SI-NEXT: s_branch BB0_1
+; SI-NEXT: BB0_6: ; %for.end
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: s_and_saveexec_b64 s[0:1], s[8:9]
+; SI-NEXT: s_cbranch_execz BB0_8
+; SI-NEXT: ; %bb.7: ; %if
+; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm
+; SI-NEXT: BB0_8: ; %end
+; SI-NEXT: s_endpgm
 entry:
   br label %for.body
Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -1,57 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; OPT-LABEL: {{^}}define amdgpu_vs void @multi_else_break(
-; OPT: main_body:
-; OPT: LOOP.outer:
-; OPT: LOOP:
-; OPT: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if.i64(
-; OPT: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
-;
-; OPT: Flow:
-; ; Ensure two if.break calls, for both the inner and outer loops
-
-; OPT: call void @llvm.amdgcn.end.cf
-; OPT-NEXT: call i64 @llvm.amdgcn.if.break.i64.i64(i1
-; OPT-NEXT: call i1 @llvm.amdgcn.loop.i64(i64
-; OPT-NEXT: call i64 @llvm.amdgcn.if.break.i64.i64(i1
-;
-; OPT: Flow1:
-
-; GCN-LABEL: {{^}}multi_else_break:
-
-; GCN: ; %main_body
-; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
-
-; GCN: [[FLOW2:BB[0-9]+_[0-9]+]]: ; %Flow2
-; GCN: s_or_b64 exec, exec, [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[LEFT_OUTER]]
-; GCN: s_andn2_b64 exec, exec, [[LEFT_OUTER]]
-; GCN: s_cbranch_execz [[IF_BLOCK:BB[0-9]+_[0-9]+]]
-
-; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
-; GCN: s_mov_b64 [[LEFT_INNER]], 0{{$}}
-
-; GCN: ; %Flow
-; GCN: s_or_b64 exec, exec, [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 [[LEFT_INNER]], [[TMP0]], [[LEFT_INNER]]
-; GCN: s_andn2_b64 exec, exec, [[LEFT_INNER]]
-; GCN: s_cbranch_execz [[FLOW2]]
-
-; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_EXEC]], vcc
-
 ; FIXME: duplicate comparison
-; GCN: ; %ENDIF
-; GCN-DAG: v_cmp_eq_u32_e32 vcc,
-; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
-
-; GCN: [[IF_BLOCK]]: ; %IF
-; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
+; OPT-LABEL: @multi_else_break(
+; OPT-NEXT: main_body:
+; OPT-NEXT: br label [[LOOP_OUTER:%.*]]
+; OPT: LOOP.outer:
+; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP9:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
+; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ]
+; OPT-NEXT: br label [[LOOP:%.*]]
+; OPT: LOOP:
+; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP7:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
+; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ]
+; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP47:%.*]], [[FLOW]] ]
+; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
+; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
+; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
+; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
+; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
+; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
+; OPT: Flow:
+; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
+; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
+; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP11:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
+; OPT-NEXT: [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64.i64(i1 [[TMP6]], i64 [[PHI_BROKEN]])
+; OPT-NEXT: [[TMP8:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]])
+; OPT-NEXT: [[TMP9]] = call i64 @llvm.amdgcn.if.break.i64.i64(i1 [[TMP5]], i64 [[PHI_BROKEN2]])
+; OPT-NEXT: br i1 [[TMP8]], label [[FLOW1]], label [[LOOP]]
+; OPT: Flow1:
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
+; OPT-NEXT: [[TMP10:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP9]])
+; OPT-NEXT: br i1 [[TMP10]], label [[IF:%.*]], label [[LOOP_OUTER]]
+; OPT: IF:
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP9]])
+; OPT-NEXT: ret void
+; OPT: ENDIF:
+; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]]
+; OPT-NEXT: [[TMP11]] = xor i1 [[TMP51]], true
+; OPT-NEXT: br label [[FLOW]]
+;
+; GCN-LABEL: multi_else_break:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b64 s[2:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_branch BB0_2
+; GCN-NEXT: BB0_1: ; %Flow2
+; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_and_b64 s[0:1], exec, s[8:9]
+; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GCN-NEXT: s_cbranch_execz BB0_6
+; GCN-NEXT: BB0_2: ; %LOOP.outer
+; GCN-NEXT: ; =>This Loop Header: Depth=1
+; GCN-NEXT: ; Child Loop BB0_4 Depth 2
+; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: s_branch BB0_4
+; GCN-NEXT: BB0_3: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT: s_and_b64 s[0:1], exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz BB0_1
+; GCN-NEXT: BB0_4: ; %LOOP
+; GCN-NEXT: ; Parent Loop BB0_2 Depth=1
+; GCN-NEXT: ; => This Inner Loop Header: Depth=2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v2
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v2, v4
+; GCN-NEXT: s_or_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT: s_cbranch_execz BB0_3
+; GCN-NEXT: ; %bb.5: ; %ENDIF
+; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v1
+; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_b64 s[12:13], vcc, exec
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_branch BB0_3
+; GCN-NEXT: BB0_6: ; %IF
+; GCN-NEXT: s_endpgm
 main_body:
   br label %LOOP.outer
@@ -73,56 +116,141 @@
   br i1 %tmp51, label %LOOP, label %LOOP.outer
 }

-; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
-; OPT: llvm.amdgcn.if.break
-; OPT: llvm.amdgcn.loop
-; OPT: llvm.amdgcn.if.break
-; OPT: llvm.amdgcn.end.cf
-
-; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[SAVED_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
-
-; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %Flow4
-; GCN: s_and_b64 [[ANDTMP0:s\[[0-9]+:[0-9]+\]]], exec, {{s\[[0-9]+:[0-9]+\]}}
-; GCN: s_or_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], [[ANDTMP0]], [[SAVED_MASK]]
-; GCN: s_and_b64 [[BROKEN_THREADS_MASK:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, exec
-; GCN: s_andn2_b64 exec, exec, [[MASK1]]
-; GCN-NEXT: s_cbranch_execz [[LOOP_EXIT:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %bb1{{$}}
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]],
-
-; GCN: ; %LeafBlock1
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[LOAD0]]
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccz [[FLOW:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %case1
-; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
-; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]]
-; GCN: s_orn2_b64 [[BROKEN_THREADS_MASK]], vcc, exec
-; GCN: BB1_{{[0-9]+}}:
-; GCN: s_mov_b64 [[FALSE_MASK:s\[[0-9]+:[0-9]+\]]], 0
-; GCN: s_and_b64 vcc, exec, [[FALSE_MASK]]
-; GCN: s_cbranch_vccz [[LOOP]]
-
-; GCN: ; %LeafBlock
-; GCN: v_cmp_eq_u32_e32 vcc, 0, [[LOAD0]]
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccz [[LOOP]]
-
-; GCN: ; %case0
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
-; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]]
-; GCN: s_andn2_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], exec
-; GCN: s_and_b64 [[TMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc, exec
-; GCN: s_or_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], [[TMP_MASK]]
-; GCN: s_branch [[LOOP]]
-
-; GCN: [[LOOP_EXIT]]: ; %Flow6
-; GCN: s_or_b64 exec, exec, [[SAVED_MASK]]
-
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
+; OPT-LABEL: @multi_if_break_loop(
+; OPT-NEXT: bb:
+; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; OPT-NEXT: [[TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]]
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: bb1:
+; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP5:%.*]], [[FLOW4:%.*]] ], [ 0, [[BB:%.*]] ]
+; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW4]] ]
+; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1
+; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
+; OPT-NEXT: [[LOAD0:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: br label [[NODEBLOCK:%.*]]
+; OPT: NodeBlock:
+; OPT-NEXT: [[PIVOT:%.*]] = icmp slt i32 [[LOAD0]], 1
+; OPT-NEXT: [[TMP0:%.*]] = xor i1 [[PIVOT]], true
+; OPT-NEXT: br i1 [[TMP0]], label [[LEAFBLOCK1:%.*]], label [[FLOW:%.*]]
+; OPT: LeafBlock1:
+; OPT-NEXT: [[SWITCHLEAF2:%.*]] = icmp eq i32 [[LOAD0]], 1
+; OPT-NEXT: br i1 [[SWITCHLEAF2]], label [[CASE1:%.*]], label [[FLOW3:%.*]]
+; OPT: Flow3:
+; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP11:%.*]], [[CASE1]] ], [ true, [[LEAFBLOCK1]] ]
+; OPT-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[CASE1]] ], [ true, [[LEAFBLOCK1]] ]
+; OPT-NEXT: br label [[FLOW]]
+; OPT: LeafBlock:
+; OPT-NEXT: [[SWITCHLEAF:%.*]] = icmp eq i32 [[LOAD0]], 0
+; OPT-NEXT: br i1 [[SWITCHLEAF]], label [[CASE0:%.*]], label [[FLOW5:%.*]]
+; OPT: Flow4:
+; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP12:%.*]], [[FLOW5]] ], [ [[TMP8:%.*]], [[FLOW]] ]
+; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP13:%.*]], [[FLOW5]] ], [ [[TMP9:%.*]], [[FLOW]] ]
+; OPT-NEXT: [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64.i64(i1 [[TMP3]], i64 [[PHI_BROKEN]])
+; OPT-NEXT: [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]])
+; OPT-NEXT: br i1 [[TMP6]], label [[FLOW6:%.*]], label [[BB1]]
+; OPT: case0:
+; OPT-NEXT: [[LOAD1:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP]], [[LOAD1]]
+; OPT-NEXT: [[TMP7:%.*]] = xor i1 [[CMP1]], true
+; OPT-NEXT: br label [[FLOW5]]
+; OPT: Flow:
+; OPT-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW3]] ], [ true, [[NODEBLOCK]] ]
+; OPT-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW3]] ], [ false, [[NODEBLOCK]] ]
+; OPT-NEXT: [[TMP10:%.*]] = phi i1 [ false, [[FLOW3]] ], [ true, [[NODEBLOCK]] ]
+; OPT-NEXT: br i1 [[TMP10]], label [[LEAFBLOCK:%.*]], label [[FLOW4]]
+; OPT: case1:
+; OPT-NEXT: [[LOAD2:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP]], [[LOAD2]]
+; OPT-NEXT: [[TMP11]] = xor i1 [[CMP2]], true
+; OPT-NEXT: br label [[FLOW3]]
+; OPT: Flow5:
+; OPT-NEXT: [[TMP12]] = phi i1 [ [[TMP7]], [[CASE0]] ], [ [[TMP8]], [[LEAFBLOCK]] ]
+; OPT-NEXT: [[TMP13]] = phi i1 [ false, [[CASE0]] ], [ true, [[LEAFBLOCK]] ]
+; OPT-NEXT: br label [[FLOW4]]
+; OPT: Flow6:
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
+; OPT-NEXT: [[TMP14:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP4]])
+; OPT-NEXT: [[TMP15:%.*]] = extractvalue { i1, i64 } [[TMP14]], 0
+; OPT-NEXT: [[TMP16:%.*]] = extractvalue { i1, i64 } [[TMP14]], 1
+; OPT-NEXT: br i1 [[TMP15]], label [[NEWDEFAULT:%.*]], label [[BB9:%.*]]
+; OPT: NewDefault:
+; OPT-NEXT: br label [[BB9]]
+; OPT: bb9:
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
+; OPT-NEXT: ret void
+;
+; GCN-LABEL: multi_if_break_loop:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x9
+; GCN-NEXT: s_mov_b64 s[0:1], 0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT: s_branch BB1_2
+; GCN-NEXT: BB1_1: ; %Flow4
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_and_b64 s[6:7], exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_and_b64 s[6:7], s[8:9], exec
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GCN-NEXT: s_cbranch_execz BB1_9
+; GCN-NEXT: BB1_2: ; %bb1
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz BB1_6
+; GCN-NEXT: ; %bb.3: ; %LeafBlock1
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_mov_b64 s[8:9], -1
+; GCN-NEXT: s_cbranch_vccz BB1_5
+; GCN-NEXT: ; %bb.4: ; %case1
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2
+; GCN-NEXT: s_mov_b64 s[8:9], 0
+; GCN-NEXT: s_orn2_b64 s[6:7], vcc, exec
+; GCN-NEXT: BB1_5: ; %Flow3
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[10:11], 0
+; GCN-NEXT: s_and_b64 vcc, exec, s[10:11]
+; GCN-NEXT: s_cbranch_vccz BB1_1
+; GCN-NEXT: s_branch BB1_7
+; GCN-NEXT: BB1_6: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[8:9], 0
+; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: s_and_b64 vcc, exec, -1
+; GCN-NEXT: s_cbranch_execz BB1_1
+; GCN-NEXT: BB1_7: ; %LeafBlock
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_mov_b64 s[8:9], -1
+; GCN-NEXT: s_cbranch_vccz BB1_1
+; GCN-NEXT: ; %bb.8: ; %case0
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 s[8:9], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GCN-NEXT: s_branch BB1_1
+; GCN-NEXT: BB1_9: ; %Flow6
+; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[4:5]
+; GCN-NEXT: s_endpgm
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
   br label %bb1
@@ -134,8 +262,8 @@
   %cmp0 = icmp slt i32 %lsr.iv.next, 0
   %load0 = load volatile i32, i32 addrspace(1)* undef, align 4
   switch i32 %load0, label %bb9 [
-    i32 0, label %case0
-    i32 1, label %case1
+    i32 0, label %case0
+    i32 1, label %case1
   ]

 case0:
Index: llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,19 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
 ;
-;
 ; Most SALU instructions ignore control flow, so we need to make sure
 ; they don't overwrite values from other blocks.
 ; If the branch decision is made based on a value in an SGPR then all
 ; threads will execute the same code paths, so we don't need to worry
 ; about instructions in different blocks overwriting each other.

-; SI-LABEL: {{^}}sgpr_if_else_salu_br:
-; SI: s_add
-; SI: s_branch
-
-; SI: s_sub
 define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; SI-LABEL: sgpr_if_else_salu_br:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb
+; SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_lg_u32 s8, 0
+; SI-NEXT: s_cbranch_scc0 BB0_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_add_i32 s0, s11, s0
+; SI-NEXT: s_branch BB0_3
+; SI-NEXT: BB0_2: ; %if
+; SI-NEXT: s_sub_i32 s0, s9, s10
+; SI-NEXT: BB0_3: ; %endif
+; SI-NEXT: s_add_i32 s0, s0, s8
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -33,26 +48,30 @@
   ret void
 }

-; SI-LABEL: {{^}}sgpr_if_else_salu_br_opt:
-; SI: s_cmp_lg_u32
-; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]]
-
-; SI: ; %bb.1: ; %else
-; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2e
-; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x37
-; SI-NOT: add
-; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
-
-; SI: [[IF]]: ; %if
-; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-; SI-NOT: add
-
-; SI: [[ENDIF]]: ; %endif
-; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
-; SI: buffer_store_dword
-; SI-NEXT: s_endpgm
 define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
+; SI-LABEL: sgpr_if_else_salu_br_opt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s2, s[0:1], 0x13
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_lg_u32 s2, 0
+; SI-NEXT: s_cbranch_scc0 BB1_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_load_dword s3, s[0:1], 0x2e
+; SI-NEXT: s_load_dword s0, s[0:1], 0x37
+; SI-NEXT: s_branch BB1_3
+; SI-NEXT: BB1_2: ; %if
+; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
+; SI-NEXT: s_load_dword s0, s[0:1], 0x25
+; SI-NEXT: BB1_3: ; %endif
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s0, s3, s0
+; SI-NEXT: s_add_i32 s0, s0, s2
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
 entry:
   %cmp0 = icmp eq i32 %a, 0
   br i1 %cmp0, label %if, label %else
@@ -74,12 +93,34 @@

 ; The two S_ADD instructions should write to different registers, since
 ; different threads will take different control flow paths.
-
-; SI-LABEL: {{^}}sgpr_if_else_valu_br:
-; SI: s_add_i32 [[SGPR:s[0-9]+]]
-; SI-NOT: s_add_i32 [[SGPR]]
-
 define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; SI-LABEL: sgpr_if_else_valu_br:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0
+; SI-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; SI-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; SI-NEXT: s_cbranch_execz BB2_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s6, s2, s3
+; SI-NEXT: BB2_2: ; %Flow
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: s_xor_b64 exec, exec, s[2:3]
+; SI-NEXT: ; %bb.3: ; %if
+; SI-NEXT: s_add_i32 s0, s0, s1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: ; %bb.4: ; %endif
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid_f = uitofp i32 %tid to float
@@ -100,24 +141,53 @@
   ret void
 }

-; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-
-; SI: ; %else
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 vcc, 0, [[AVAL]]
-; SI: s_and_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], vcc, exec
-
-; SI: ; %if
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI-DAG: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec
-; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
-; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]]
-
-; SI: ; %endif
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
-; SI: buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+; SI-LABEL: sgpr_if_else_valu_cmp_phi_br:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT: ; implicit-def: $sgpr2_sgpr3
+; SI-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; SI-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; SI-NEXT: s_cbranch_execz BB3_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
+; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_mov_b32 s15, 0xf000
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v1, v[1:2], s[12:15], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; SI-NEXT: BB3_2: ; %Flow
+; SI-NEXT: s_or_saveexec_b64 s[0:1], s[8:9]
+; SI-NEXT: s_xor_b64 exec, exec, s[0:1]
+; SI-NEXT: s_cbranch_execz BB3_4
+; SI-NEXT: ; %bb.3: ; %if
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; SI-NEXT: BB3_4: ; %endif
+; SI-NEXT: s_or_b64 exec, exec, s[0:1]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp eq i32 %tid, 0