Index: llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -206,17 +206,22 @@ bool Changed = false; std::vector<DominatorTree::UpdateType> Updates; + // TODO: For now we unify all exit blocks, even those that are uniformly + // reachable, if there are any exits not uniformly reached. This is to + // work around the limitation of structurizer, which cannot handle multiple + // function exits. After structurizer is able to handle multiple function + // exits, we should only unify exit blocks that are not uniformly + // reachable. + bool HasDivergentExitBlock = llvm::any_of( + PDT.roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); }); + for (BasicBlock *BB : PDT.roots()) { if (isa<ReturnInst>(BB->getTerminator())) { - if (!isUniformlyReached(DA, *BB)) + if (HasDivergentExitBlock) ReturningBlocks.push_back(BB); } else if (isa<UnreachableInst>(BB->getTerminator())) { - // TODO: For now we unify UnreachableBlocks even though they are uniformly - // reachable. This is to workaround the limitation of structurizer, which - // can not handle multiple function exits. After structurizer is able to - // handle multiple function exits, we should only unify UnreachableBlocks - // that are not uniformly reachable. 
- UnreachableBlocks.push_back(BB); + if (HasDivergentExitBlock) + UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -50,28 +50,20 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; GCN-LABEL: sgpr_trunc_brcond: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s1, -1 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cbranch_scc0 .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: flat_store_dword v[0:1], v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB3_2: ; %Flow ; GCN-NEXT: s_xor_b32 s0, s0, -1 ; GCN-NEXT: s_and_b32 s0, s0, 1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB3_4 -; GCN-NEXT: ; %bb.3: ; %bb0 +; GCN-NEXT: s_cbranch_scc1 .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB3_4: ; %UnifiedUnreachableBlock +; GCN-NEXT: .LBB3_2: ; %bb1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) entry: %trunc = trunc i32 %cond to i1 br i1 %trunc, label %bb0, label %bb1 @@ -90,27 +82,19 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s1, s0, s1 -; GCN-NEXT: s_xor_b32 s1, s1, -1 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_mov_b32 s0, -1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cbranch_scc0 .LBB4_2 
-; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: flat_store_dword v[0:1], v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB4_2: ; %Flow +; GCN-NEXT: s_and_b32 s0, s0, s1 ; GCN-NEXT: s_xor_b32 s0, s0, -1 ; GCN-NEXT: s_and_b32 s0, s0, 1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB4_4 -; GCN-NEXT: ; %bb.3: ; %bb0 +; GCN-NEXT: s_cbranch_scc1 .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB4_4: ; %UnifiedUnreachableBlock +; GCN-NEXT: .LBB4_2: ; %bb1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) entry: %trunc0 = trunc i32 %cond0 to i1 %trunc1 = trunc i32 %cond1 to i1 Index: llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -0,0 +1,222 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -verify -S %s -o - | FileCheck -check-prefix=IR %s + +; A test with a divergent unreachable block and a uniform return block. The +; compiler needs to create a region that includes them so that +; StructurizeCFG correctly transforms the CFG, and then SI Annotate Control +; Flow does not fail during annotation. 
+ +define void @my_func(i32 %0) { +; IR-LABEL: @my_func( +; IR-NEXT: entry: +; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) null, align 8 +; IR-NEXT: br label [[NODEBLOCK:%.*]] +; IR: NodeBlock: +; IR-NEXT: [[PIVOT:%.*]] = icmp sge i32 [[TMP1]], 1 +; IR-NEXT: br i1 [[PIVOT]], label [[LEAFBLOCK1:%.*]], label [[FLOW:%.*]] +; IR: LeafBlock1: +; IR-NEXT: [[SWITCHLEAF2:%.*]] = icmp ne i32 [[TMP1]], 1 +; IR-NEXT: br label [[FLOW]] +; IR: Flow: +; IR-NEXT: [[TMP2:%.*]] = phi i1 [ [[SWITCHLEAF2]], [[LEAFBLOCK1]] ], [ false, [[NODEBLOCK]] ] +; IR-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[LEAFBLOCK1]] ], [ true, [[NODEBLOCK]] ] +; IR-NEXT: br i1 [[TMP3]], label [[LEAFBLOCK:%.*]], label [[FLOW11:%.*]] +; IR: LeafBlock: +; IR-NEXT: [[SWITCHLEAF:%.*]] = icmp eq i32 [[TMP1]], 0 +; IR-NEXT: br i1 [[SWITCHLEAF]], label [[SW_BB2:%.*]], label [[FLOW12:%.*]] +; IR: Flow11: +; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW12]] ], [ false, [[FLOW]] ] +; IR-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP10:%.*]], [[FLOW12]] ], [ [[TMP2]], [[FLOW]] ] +; IR-NEXT: [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]]) +; IR-NEXT: [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0 +; IR-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1 +; IR-NEXT: br i1 [[TMP7]], label [[DO_BODY:%.*]], label [[FLOW17:%.*]] +; IR: sw.bb2: +; IR-NEXT: br label [[NODEBLOCK7:%.*]] +; IR: Flow12: +; IR-NEXT: [[TMP9]] = phi i1 [ [[TMP24:%.*]], [[FLOW15:%.*]] ], [ false, [[LEAFBLOCK]] ] +; IR-NEXT: [[TMP10]] = phi i1 [ [[TMP25:%.*]], [[FLOW15]] ], [ true, [[LEAFBLOCK]] ] +; IR-NEXT: br label [[FLOW11]] +; IR: NodeBlock7: +; IR-NEXT: [[PIVOT8:%.*]] = icmp sge i32 [[TMP0:%.*]], 2 +; IR-NEXT: [[TMP11:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[PIVOT8]]) +; IR-NEXT: [[TMP12:%.*]] = extractvalue { i1, i64 } [[TMP11]], 0 +; IR-NEXT: [[TMP13:%.*]] = extractvalue { i1, i64 } [[TMP11]], 1 +; IR-NEXT: br i1 [[TMP12]], label [[LEAFBLOCK5:%.*]], label [[FLOW13:%.*]] +; IR: LeafBlock5: +; IR-NEXT: 
[[SWITCHLEAF6:%.*]] = icmp eq i32 [[TMP0]], 2 +; IR-NEXT: br label [[FLOW13]] +; IR: Flow13: +; IR-NEXT: [[TMP14:%.*]] = phi i1 [ true, [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ] +; IR-NEXT: [[TMP15:%.*]] = phi i1 [ [[SWITCHLEAF6]], [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ] +; IR-NEXT: [[TMP16:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP13]]) +; IR-NEXT: [[TMP17:%.*]] = extractvalue { i1, i64 } [[TMP16]], 0 +; IR-NEXT: [[TMP18:%.*]] = extractvalue { i1, i64 } [[TMP16]], 1 +; IR-NEXT: br i1 [[TMP17]], label [[LEAFBLOCK3:%.*]], label [[FLOW14:%.*]] +; IR: LeafBlock3: +; IR-NEXT: [[SWITCHLEAF4:%.*]] = icmp eq i32 [[TMP0]], 0 +; IR-NEXT: [[SWITCHLEAF4_INV:%.*]] = xor i1 [[SWITCHLEAF4]], true +; IR-NEXT: br label [[FLOW14]] +; IR: Flow14: +; IR-NEXT: [[TMP19:%.*]] = phi i1 [ [[SWITCHLEAF4_INV]], [[LEAFBLOCK3]] ], [ [[TMP14]], [[FLOW13]] ] +; IR-NEXT: [[TMP20:%.*]] = phi i1 [ [[SWITCHLEAF4]], [[LEAFBLOCK3]] ], [ [[TMP15]], [[FLOW13]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP18]]) +; IR-NEXT: [[TMP21:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP20]]) +; IR-NEXT: [[TMP22:%.*]] = extractvalue { i1, i64 } [[TMP21]], 0 +; IR-NEXT: [[TMP23:%.*]] = extractvalue { i1, i64 } [[TMP21]], 1 +; IR-NEXT: br i1 [[TMP22]], label [[LAND_LHS_TRUE_I:%.*]], label [[FLOW15]] +; IR: land.lhs.true.i: +; IR-NEXT: br label [[LEAFBLOCK9:%.*]] +; IR: Flow15: +; IR-NEXT: [[TMP24]] = phi i1 [ [[TMP29:%.*]], [[FLOW16:%.*]] ], [ false, [[FLOW14]] ] +; IR-NEXT: [[TMP25]] = phi i1 [ [[TMP30:%.*]], [[FLOW16]] ], [ [[TMP19]], [[FLOW14]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP23]]) +; IR-NEXT: br label [[FLOW12]] +; IR: LeafBlock9: +; IR-NEXT: [[SWITCHLEAF10:%.*]] = icmp sgt i32 [[TMP0]], 1 +; IR-NEXT: [[TMP26:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[SWITCHLEAF10]]) +; IR-NEXT: [[TMP27:%.*]] = extractvalue { i1, i64 } [[TMP26]], 0 +; IR-NEXT: [[TMP28:%.*]] = extractvalue { i1, i64 } [[TMP26]], 1 +; IR-NEXT: br i1 [[TMP27]], label 
[[DO_BODY_I_I_I_I:%.*]], label [[FLOW16]] +; IR: do.body.i.i.i.i: +; IR-NEXT: tail call fastcc void null() +; IR-NEXT: br label [[FLOW16]] +; IR: Flow16: +; IR-NEXT: [[TMP29]] = phi i1 [ true, [[DO_BODY_I_I_I_I]] ], [ false, [[LEAFBLOCK9]] ] +; IR-NEXT: [[TMP30]] = phi i1 [ false, [[DO_BODY_I_I_I_I]] ], [ true, [[LEAFBLOCK9]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP28]]) +; IR-NEXT: br label [[FLOW15]] +; IR: do.body: +; IR-NEXT: tail call fastcc void null() +; IR-NEXT: br label [[FLOW17]] +; IR: Flow17: +; IR-NEXT: [[TMP31:%.*]] = phi i1 [ true, [[DO_BODY]] ], [ [[TMP4]], [[FLOW11]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) +; IR-NEXT: [[TMP32:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP31]]) +; IR-NEXT: [[TMP33:%.*]] = extractvalue { i1, i64 } [[TMP32]], 0 +; IR-NEXT: [[TMP34:%.*]] = extractvalue { i1, i64 } [[TMP32]], 1 +; IR-NEXT: br i1 [[TMP33]], label [[UNIFIEDUNREACHABLEBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: UnifiedUnreachableBlock: +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label [[UNIFIEDRETURNBLOCK]] +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP34]]) +; IR-NEXT: ret void +; +; GCN-LABEL: my_func: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s8, 1 +; GCN-NEXT: s_mov_b64 s[4:5], -1 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %LeafBlock1 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: .LBB0_2: ; %Flow +; GCN-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_cbranch_vccnz .LBB0_13 +; GCN-NEXT: ; %bb.3: ; %LeafBlock +; GCN-NEXT: s_cmp_eq_u32 s8, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_5 +; GCN-NEXT: ; %bb.4: +; GCN-NEXT: s_mov_b64 s[6:7], -1 +; GCN-NEXT: s_mov_b64 
s[4:5], 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GCN-NEXT: s_cbranch_execnz .LBB0_14 +; GCN-NEXT: s_branch .LBB0_15 +; GCN-NEXT: .LBB0_5: ; %NodeBlock7 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b64 s[8:9], 0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: ; %bb.6: ; %LeafBlock5 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: ; %bb.7: ; %Flow13 +; GCN-NEXT: s_andn2_saveexec_b64 s[10:11], s[4:5] +; GCN-NEXT: ; %bb.8: ; %LeafBlock3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[12:13], vcc, exec +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GCN-NEXT: ; %bb.9: ; %Flow14 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9] +; GCN-NEXT: s_cbranch_execz .LBB0_18 +; GCN-NEXT: ; %bb.10: ; %LeafBlock9 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN-NEXT: ; %bb.11: ; %do.body.i.i.i.i +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_xor_b64 s[8:9], exec, -1 +; GCN-NEXT: ; %bb.12: ; %Flow16 +; GCN-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: .LBB0_13: ; %Flow11 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GCN-NEXT: .LBB0_14: ; %do.body +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec +; GCN-NEXT: .LBB0_15: ; %Flow17 +; GCN-NEXT: 
s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock +; GCN-NEXT: ; divergent unreachable +; GCN-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB0_18: ; %Flow15 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GCN-NEXT: s_cbranch_execnz .LBB0_14 +; GCN-NEXT: s_branch .LBB0_15 +entry: + %1 = load i32, ptr addrspace(4) null, align 8 + switch i32 %1, label %do.body [ + i32 1, label %sw.bb + i32 0, label %sw.bb2 + ] + +sw.bb: + ret void + +sw.bb2: + switch i32 %0, label %do.body [ + i32 0, label %land.lhs.true.i + i32 2, label %land.lhs.true.i + ] + +land.lhs.true.i: + switch i32 %0, label %do.body.i.i.i.i [ + i32 0, label %do.body + i32 1, label %do.body + ] + +do.body.i.i.i.i: + tail call fastcc void null() + unreachable + +do.body: + tail call fastcc void null() + unreachable + +} Index: llvm/test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1292,45 +1292,40 @@ ; SI-LABEL: no_skip_no_successors: ; SI: ; %bb.0: ; %bb ; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0 -; SI-NEXT: s_mov_b64 s[2:3], exec -; SI-NEXT: s_mov_b64 s[0:1], -1 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: s_cbranch_vccnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock -; SI-NEXT: .LBB12_3: ; %bb3 -; SI-NEXT: s_branch .LBB12_2 -; SI-NEXT: .LBB12_4: ; %bb6 +; SI-NEXT: ; %bb.1: ; %bb6 +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cbranch_scc0 .LBB12_6 -; SI-NEXT: ; %bb.5: ; %bb6 +; SI-NEXT: s_cbranch_scc0 .LBB12_5 +; SI-NEXT: ; %bb.2: ; %bb6 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: .LBB12_6: +; 
SI-NEXT: .LBB12_3: ; %bb3 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7ae148 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, s0, v0 +; SI-NEXT: s_and_b64 vcc, exec, vcc +; SI-NEXT: ; %bb.4: ; %bb5 +; SI-NEXT: .LBB12_5: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; ; GFX10-WAVE64-LABEL: no_skip_no_successors: ; GFX10-WAVE64: ; %bb.0: ; %bb -; GFX10-WAVE64-NEXT: v_cmp_nge_f32_e64 s[0:1], s1, 0 -; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec -; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-WAVE64-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0 +; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX10-WAVE64-NEXT: s_cbranch_vccz .LBB12_3 -; GFX10-WAVE64-NEXT: ; %bb.1: ; %Flow -; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX10-WAVE64-NEXT: s_cbranch_vccnz .LBB12_4 -; GFX10-WAVE64-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock -; GFX10-WAVE64-NEXT: .LBB12_3: ; %bb3 -; GFX10-WAVE64-NEXT: s_branch .LBB12_2 -; GFX10-WAVE64-NEXT: .LBB12_4: ; %bb6 +; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb6 +; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB12_6 -; GFX10-WAVE64-NEXT: ; %bb.5: ; %bb6 +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB12_5 +; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb6 ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 -; GFX10-WAVE64-NEXT: .LBB12_6: +; GFX10-WAVE64-NEXT: .LBB12_3: ; %bb3 +; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0 +; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb5 +; GFX10-WAVE64-NEXT: .LBB12_5: ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 ; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE64-NEXT: s_endpgm @@ -1338,46 +1333,42 @@ ; GFX10-WAVE32-LABEL: no_skip_no_successors: ; GFX10-WAVE32: ; %bb.0: ; %bb ; GFX10-WAVE32-NEXT: v_cmp_nge_f32_e64 s1, s1, 0 -; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s1 -; 
GFX10-WAVE32-NEXT: s_mov_b32 s1, -1 ; GFX10-WAVE32-NEXT: s_cbranch_vccz .LBB12_3 -; GFX10-WAVE32-NEXT: ; %bb.1: ; %Flow -; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_vccnz .LBB12_4 -; GFX10-WAVE32-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock -; GFX10-WAVE32-NEXT: .LBB12_3: ; %bb3 -; GFX10-WAVE32-NEXT: s_branch .LBB12_2 -; GFX10-WAVE32-NEXT: .LBB12_4: ; %bb6 -; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB12_6 -; GFX10-WAVE32-NEXT: ; %bb.5: ; %bb6 +; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb6 +; GFX10-WAVE32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB12_5 +; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb6 ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-WAVE32-NEXT: .LBB12_6: +; GFX10-WAVE32-NEXT: .LBB12_3: ; %bb3 +; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e64 s0, 0x3e7ae148, s0 +; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb5 +; GFX10-WAVE32-NEXT: .LBB12_5: ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm ; ; GFX11-LABEL: no_skip_no_successors: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_cmp_nge_f32_e64 s[0:1], s1, 0 -; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX11-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX11-NEXT: s_cbranch_vccz .LBB12_3 -; GFX11-NEXT: ; %bb.1: ; %Flow -; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX11-NEXT: s_cbranch_vccnz .LBB12_4 -; GFX11-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock -; GFX11-NEXT: .LBB12_3: ; %bb3 -; GFX11-NEXT: s_branch .LBB12_2 -; GFX11-NEXT: .LBB12_4: ; %bb6 +; GFX11-NEXT: ; %bb.1: ; %bb6 +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], 
exec -; GFX11-NEXT: s_cbranch_scc0 .LBB12_6 -; GFX11-NEXT: ; %bb.5: ; %bb6 +; GFX11-NEXT: s_cbranch_scc0 .LBB12_5 +; GFX11-NEXT: ; %bb.2: ; %bb6 ; GFX11-NEXT: s_mov_b64 exec, 0 -; GFX11-NEXT: .LBB12_6: +; GFX11-NEXT: .LBB12_3: ; %bb3 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX11-NEXT: ; %bb.4: ; %bb5 +; GFX11-NEXT: .LBB12_5: ; GFX11-NEXT: s_mov_b64 exec, 0 ; GFX11-NEXT: exp mrt0 off, off, off, off done ; GFX11-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -7,57 +7,135 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s33, 16 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s36, 4 +; GCN-NEXT: v_writelane_b32 v40, s37, 5 +; GCN-NEXT: v_writelane_b32 v40, s38, 6 +; GCN-NEXT: v_writelane_b32 v40, s39, 7 +; GCN-NEXT: v_writelane_b32 v40, s40, 8 +; GCN-NEXT: v_writelane_b32 v40, s41, 9 +; GCN-NEXT: v_writelane_b32 v40, s42, 10 +; GCN-NEXT: v_writelane_b32 v40, s43, 11 +; GCN-NEXT: v_writelane_b32 v40, s44, 12 +; GCN-NEXT: v_writelane_b32 v40, s45, 13 +; GCN-NEXT: v_writelane_b32 v40, s46, 14 +; GCN-NEXT: 
v_writelane_b32 v40, s47, 15 +; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: s_mov_b32 s42, s15 +; GCN-NEXT: s_mov_b32 s43, s14 +; GCN-NEXT: s_mov_b32 s44, s13 +; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_mov_b64 s[8:9], -1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %bb4 -; GCN-NEXT: s_cmp_lg_u32 s16, 9 -; GCN-NEXT: s_cbranch_scc1 .LBB0_4 -; GCN-NEXT: ; %bb.2: ; %bb7 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: s_branch .LBB0_7 -; GCN-NEXT: .LBB0_3: ; %bb2 -; GCN-NEXT: s_cmp_eq_u32 s16, 21 -; GCN-NEXT: s_cbranch_scc1 .LBB0_6 -; GCN-NEXT: .LBB0_4: ; %bb9 +; GCN-NEXT: s_mov_b64 s[46:47], 0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_cbranch_vccz .LBB0_9 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GCN-NEXT: s_cbranch_vccz .LBB0_10 +; GCN-NEXT: .LBB0_2: ; %Flow1 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-NEXT: s_cbranch_vccnz .LBB0_4 +; GCN-NEXT: .LBB0_3: ; %bb9 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s45 +; GCN-NEXT: s_mov_b32 s13, s44 +; GCN-NEXT: s_mov_b32 s14, s43 +; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], 
s[16:17] -; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execnz .LBB0_7 -; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: .LBB0_6: ; %bb12 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_andn2_b64 s[6:7], s[46:47], exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_or_b64 s[46:47], s[6:7], s[8:9] +; GCN-NEXT: .LBB0_4: ; %Flow2 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[46:47] +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_6 +; GCN-NEXT: ; %bb.5: ; %bb12 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: .LBB0_7: ; %UnifiedReturnBlock +; GCN-NEXT: .LBB0_6: ; %Flow3 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccnz .LBB0_8 +; GCN-NEXT: ; %bb.7: ; %bb7 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s45 +; GCN-NEXT: s_mov_b32 s13, s44 +; GCN-NEXT: s_mov_b32 s14, s43 +; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock +; GCN-NEXT: v_readlane_b32 s47, v40, 15 +; GCN-NEXT: v_readlane_b32 s46, v40, 14 +; GCN-NEXT: v_readlane_b32 s45, v40, 13 +; GCN-NEXT: v_readlane_b32 s44, v40, 12 +; GCN-NEXT: v_readlane_b32 s43, v40, 11 +; GCN-NEXT: v_readlane_b32 s42, v40, 10 +; GCN-NEXT: v_readlane_b32 s41, v40, 9 +; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s39, v40, 7 +; GCN-NEXT: v_readlane_b32 s38, v40, 6 +; GCN-NEXT: v_readlane_b32 
s37, v40, 5 +; GCN-NEXT: v_readlane_b32 s36, v40, 4 +; GCN-NEXT: v_readlane_b32 s35, v40, 3 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v40, 16 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB0_9: ; %bb2 +; GCN-NEXT: v_cmp_eq_u32_e64 s[46:47], 21, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 21, v0 +; GCN-NEXT: s_mov_b64 vcc, exec +; GCN-NEXT: s_cbranch_execnz .LBB0_2 +; GCN-NEXT: .LBB0_10: ; %bb4 +; GCN-NEXT: s_mov_b64 s[4:5], -1 +; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 9, v0 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-NEXT: s_cbranch_vccz .LBB0_3 +; GCN-NEXT: s_branch .LBB0_4 ; SI-OPT-LABEL: @widget( ; SI-OPT-NEXT: bb: ; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16