diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" @@ -246,6 +247,7 @@ SmallVector Order; BBSet Visited; + BBSet FlowSet; SmallVector AffectedPhis; BBPhiMap DeletedPhis; @@ -278,6 +280,9 @@ void addPhiValues(BasicBlock *From, BasicBlock *To); + void findUndefBlocks(BasicBlock *PHIBlock, + const SmallSet &Incomings, + SmallVector &UndefBlks) const; void setPhiValues(); void simplifyAffectedPhis(); @@ -632,6 +637,67 @@ AddedPhis[To].push_back(From); } +/// When we are reconstructing a PHI inside \p PHIBlock with incoming values +/// from predecessors \p Incomings, we have a chance to mark the available value +/// from some blocks as undefined. The function finds all such blocks +/// and returns them in \p UndefBlks. +void StructurizeCFG::findUndefBlocks( + BasicBlock *PHIBlock, const SmallSet &Incomings, + SmallVector &UndefBlks) const { + // We may get a post-structured CFG like below: + // + // | P1 + // |/ + // F1 + // |\ + // | N + // |/ + // F2 + // |\ + // | P2 + // |/ + // F3 + // |\ + // B + // + // B is the block that has a PHI being reconstructed. P1/P2 are predecessors + // of B before structurization. F1/F2/F3 are flow blocks inserted during + // the structurization process. Block N is not a predecessor of B before + // structurization, but is placed between the predecessors (P1/P2) of B after + // structurization. This usually means that threads that went to N never take the + // path N->F2->F3->B. For example, the threads that take the branch F1->N may + // always take the branch F2->P2.
So, when we are reconstructing a PHI + // originally in B, we can safely say the incoming value from N is undefined. + SmallSet VisitedBlock; + SmallVector Stack; + if (PHIBlock == ParentRegion->getExit()) { + for (auto P : predecessors(PHIBlock)) { + if (ParentRegion->contains(P)) + Stack.push_back(P); + } + } else { + append_range(Stack, predecessors(PHIBlock)); + } + + // Do a backward traversal over the CFG, and stop further searching if + // the block is not a flow block. If a block is neither a flow block nor an + // incoming predecessor, then the incoming value from the block is + // an undefined value for the PHI being reconstructed. + while (!Stack.empty()) { + BasicBlock *Current = Stack.pop_back_val(); + if (VisitedBlock.contains(Current)) + continue; + + VisitedBlock.insert(Current); + if (FlowSet.contains(Current)) { + for (auto P : predecessors(Current)) + Stack.push_back(P); + } else if (!Incomings.contains(Current)) { + UndefBlks.push_back(Current); + } + } +} + /// Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SmallVector InsertedPhis; @@ -643,6 +709,8 @@ if (!DeletedPhis.count(To)) continue; + SmallVector UndefBlks; + bool CachedUndefs = false; PhiMap &Map = DeletedPhis[To]; for (const auto &PI : Map) { PHINode *Phi = PI.first; @@ -651,15 +719,30 @@ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); Updater.AddAvailableValue(To, Undef); - NearestCommonDominator Dominator(DT); - Dominator.addBlock(To); + SmallSet Incomings; + SmallVector ConstantPreds; for (const auto &VI : PI.second) { + Incomings.insert(VI.first); Updater.AddAvailableValue(VI.first, VI.second); - Dominator.addAndRememberBlock(VI.first); + if (isa(VI.second)) + ConstantPreds.push_back(VI.first); } - if (!Dominator.resultIsRememberedBlock()) - Updater.AddAvailableValue(Dominator.result(), Undef); + if (!CachedUndefs) { + findUndefBlocks(To, Incomings, UndefBlks); + CachedUndefs = true; + } + + for (auto UB : UndefBlks) { + // If this 
undef block is dominated by any predecessor(before + // structurization) of reconstructed PHI with constant incoming value, + // don't mark the available value as undefined. Setting undef to such + // block will stop us from getting optimal phi insertion. + if (any_of(ConstantPreds, + [&](BasicBlock *CP) { return DT->dominates(CP, UB); })) + continue; + Updater.AddAvailableValue(UB, Undef); + } for (BasicBlock *FI : From) Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI)); @@ -759,6 +842,7 @@ Order.back()->getEntry(); BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, Func, Insert); + FlowSet.insert(Flow); DT->addNewBlock(Flow, Dominator); ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); return Flow; @@ -1103,6 +1187,7 @@ Loops.clear(); LoopPreds.clear(); LoopConds.clear(); + FlowSet.clear(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -9,34 +9,32 @@ ; OPT-NEXT: main_body: ; OPT-NEXT: br label [[LOOP_OUTER:%.*]] ; OPT: LOOP.outer: -; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] -; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ] +; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] +; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP3:%.*]], [[FLOW1]] ] ; OPT-NEXT: br label [[LOOP:%.*]] ; OPT: LOOP: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] -; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ] -; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ] +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] +; OPT-NEXT: [[TMP45:%.*]] = phi 
i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP3]], [[FLOW]] ] ; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]] -; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]]) -; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0 -; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1 -; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]] +; OPT-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]]) +; OPT-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 +; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 +; OPT-NEXT: br i1 [[TMP1]], label [[ENDIF:%.*]], label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ] -; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ] -; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]]) -; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]]) -; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) -; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]]) -; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]] +; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ undef, [[LOOP]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) +; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) +; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) +; OPT-NEXT: br i1 [[TMP7]], 
label [[FLOW1]], label [[LOOP]] ; OPT: Flow1: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) -; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]]) -; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]] +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) +; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) +; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]] ; OPT: IF: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]]) +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: ret void ; OPT: ENDIF: ; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 @@ -156,7 +154,7 @@ ; OPT-NEXT: [[CMP2]] = icmp sge i32 [[TMP]], [[LOAD2]] ; OPT-NEXT: br label [[FLOW3]] ; OPT: Flow5: -; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ [[TMP6]], [[LEAFBLOCK]] ] +; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ undef, [[LEAFBLOCK]] ] ; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP7]], [[LEAFBLOCK]] ] ; OPT-NEXT: br label [[FLOW4]] ; OPT: bb9: diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -222,16 +222,16 @@ ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 40 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 41 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 +; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow19 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a63, v31 
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 10 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[8:9] ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a62, v30 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a61, v29 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a60, v28 @@ -263,11 +263,10 @@ ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a34, v2 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a33, v1 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a32, v0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 11 -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 -; GLOBALNESS1-NEXT: ; Child Loop BB1_17 Depth 2 +; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS1-NEXT: v_readlane_b32 s60, v41, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s61, v41, 1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1] @@ -315,11 +314,10 @@ ; GLOBALNESS1-NEXT: v_readlane_b32 s91, v41, 31 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[52:53] -; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] -; GLOBALNESS1-NEXT: ; kill: killed $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lt_i32 s59, 1 @@ -327,23 +325,19 @@ ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock3 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s59, 1 -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 +; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execz 
.LBB1_8 ; GLOBALNESS1-NEXT: s_branch .LBB1_9 ; GLOBALNESS1-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS1-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s59, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow18 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s8, 10 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s9, 11 -; GLOBALNESS1-NEXT: .LBB1_10: ; %Flow16 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_readlane_b32 s68, v41, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s69, v41, 1 @@ -411,8 +405,8 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s98, s57 ; GLOBALNESS1-NEXT: s_mov_b32 s99, s57 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1] @@ -429,7 +423,7 @@ ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 -; GLOBALNESS1-NEXT: ; %bb.11: ; %baz.exit.i +; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; 
GLOBALNESS1-NEXT: flat_load_dword v0, v[0:1] @@ -568,8 +562,8 @@ ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[88:89], s[54:55] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS1-NEXT: ; %bb.12: ; %bb33.i +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 +; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off @@ -577,12 +571,12 @@ ; GLOBALNESS1-NEXT: v_readlane_b32 s5, v41, 37 ; GLOBALNESS1-NEXT: s_mov_b32 s91, s59 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS1-NEXT: ; %bb.13: ; %bb39.i +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 +; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[44:45], off -; GLOBALNESS1-NEXT: .LBB1_14: ; %bb44.lr.ph.i +; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc @@ -593,40 +587,40 @@ ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 ; GLOBALNESS1-NEXT: v_readlane_b32 s63, v41, 33 ; GLOBALNESS1-NEXT: v_readlane_b32 s65, v41, 35 -; GLOBALNESS1-NEXT: s_branch .LBB1_17 -; GLOBALNESS1-NEXT: .LBB1_15: ; %Flow7 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_branch .LBB1_16 +; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow7 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: .LBB1_16: ; %bb63.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i +; GLOBALNESS1-NEXT: ; in Loop: 
Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 -; GLOBALNESS1-NEXT: .LBB1_17: ; %bb44.i +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 +; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[100:101] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_16 -; GLOBALNESS1-NEXT: ; %bb.18: ; %bb46.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 +; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[46:47] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_16 -; GLOBALNESS1-NEXT: ; %bb.19: ; %bb50.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 +; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[62:63] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_22 -; GLOBALNESS1-NEXT: ; %bb.20: ; %bb3.i.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 +; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_22 -; GLOBALNESS1-NEXT: ; %bb.21: ; %bb6.i.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 +; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[56:57] -; GLOBALNESS1-NEXT: .LBB1_22: ; %spam.exit.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[48:49] -; 
GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_16 -; GLOBALNESS1-NEXT: ; %bb.23: ; %bb55.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 +; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_add_u32 s60, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s61, s39, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -650,14 +644,14 @@ ; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], a[32:33], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_15 -; GLOBALNESS1-NEXT: ; %bb.24: ; %bb62.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS1-NEXT: s_branch .LBB1_15 -; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 +; GLOBALNESS1-NEXT: s_branch .LBB1_14 +; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow14 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_readlane_b32 s56, v41, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s57, v41, 1 @@ -726,35 +720,35 @@ ; GLOBALNESS1-NEXT: v_readlane_b32 s65, v41, 9 ; GLOBALNESS1-NEXT: v_readlane_b32 s66, v41, 10 ; GLOBALNESS1-NEXT: v_readlane_b32 s67, v41, 11 -; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15 +; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[88:89] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[92:93] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i +; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: 
v_readlane_b32 s6, v41, 38 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 39 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i +; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 -; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i +; GLOBALNESS1-NEXT: .LBB1_28: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 -; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard +; GLOBALNESS1-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_32 -; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_31 +; GLOBALNESS1-NEXT: ; %bb.30: ; %bb7.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -769,10 +763,10 @@ ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS1-NEXT: .LBB1_32: ; %Flow +; GLOBALNESS1-NEXT: .LBB1_31: ; %Flow ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_34 -; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_33 +; GLOBALNESS1-NEXT: ; %bb.32: ; %bb11.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -786,7 +780,7 @@ ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, 
widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock +; GLOBALNESS1-NEXT: .LBB1_33: ; %UnifiedUnreachableBlock ; ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb @@ -982,16 +976,16 @@ ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 40 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 41 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 +; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow19 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a63, v31 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 10 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[8:9] ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a62, v30 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a61, v29 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a60, v28 @@ -1023,11 +1017,10 @@ ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a34, v2 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a33, v1 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a32, v0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 11 -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 -; GLOBALNESS0-NEXT: ; Child Loop BB1_17 Depth 2 +; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 0 ; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1] @@ -1075,11 +1068,10 @@ ; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 31 ; 
GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] -; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] -; GLOBALNESS0-NEXT: ; kill: killed $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lt_i32 s59, 1 @@ -1087,23 +1079,19 @@ ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock3 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s59, 1 -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 +; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 ; GLOBALNESS0-NEXT: s_branch .LBB1_9 ; GLOBALNESS0-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s59, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow18 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s8, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s9, 11 -; GLOBALNESS0-NEXT: .LBB1_10: ; %Flow16 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 0 ; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 1 
@@ -1169,8 +1157,8 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s98, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s99, s57 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1] @@ -1189,7 +1177,7 @@ ; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 2 ; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 3 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 -; GLOBALNESS0-NEXT: ; %bb.11: ; %baz.exit.i +; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[0:1] @@ -1328,8 +1316,8 @@ ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[88:89], s[54:55] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS0-NEXT: ; %bb.12: ; %bb33.i +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 +; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off @@ -1337,12 +1325,12 @@ ; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 37 ; GLOBALNESS0-NEXT: s_mov_b32 s91, s59 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS0-NEXT: ; %bb.13: ; %bb39.i +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 +; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS0-NEXT: global_store_dwordx2 
v[2:3], v[44:45], off -; GLOBALNESS0-NEXT: .LBB1_14: ; %bb44.lr.ph.i +; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc @@ -1353,40 +1341,40 @@ ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 ; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 35 ; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 33 -; GLOBALNESS0-NEXT: s_branch .LBB1_17 -; GLOBALNESS0-NEXT: .LBB1_15: ; %Flow7 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_branch .LBB1_16 +; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow7 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: .LBB1_16: ; %bb63.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[52:53] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 -; GLOBALNESS0-NEXT: .LBB1_17: ; %bb44.i +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 +; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[46:47] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_16 -; GLOBALNESS0-NEXT: ; %bb.18: ; %bb46.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 +; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_16 -; GLOBALNESS0-NEXT: ; %bb.19: ; %bb50.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 +; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[62:63] -; 
GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_22 -; GLOBALNESS0-NEXT: ; %bb.20: ; %bb3.i.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 +; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[60:61] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_22 -; GLOBALNESS0-NEXT: ; %bb.21: ; %bb6.i.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 +; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] -; GLOBALNESS0-NEXT: .LBB1_22: ; %spam.exit.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[48:49] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_16 -; GLOBALNESS0-NEXT: ; %bb.23: ; %bb55.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 +; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_add_u32 s64, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s65, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -1410,14 +1398,14 @@ ; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], a[32:33], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_15 -; GLOBALNESS0-NEXT: ; %bb.24: ; %bb62.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: s_branch 
.LBB1_15 -; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 +; GLOBALNESS0-NEXT: s_branch .LBB1_14 +; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow14 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s56, v41, 0 ; GLOBALNESS0-NEXT: v_readlane_b32 s57, v41, 1 @@ -1486,35 +1474,35 @@ ; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 5 ; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 6 ; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 7 -; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 +; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[88:89] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[92:93] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i +; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 38 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 39 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i +; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 -; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i +; GLOBALNESS0-NEXT: .LBB1_28: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 -; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard +; GLOBALNESS0-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_32 -; 
GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_31 +; GLOBALNESS0-NEXT: ; %bb.30: ; %bb7.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -1529,10 +1517,10 @@ ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS0-NEXT: .LBB1_32: ; %Flow +; GLOBALNESS0-NEXT: .LBB1_31: ; %Flow ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_34 -; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_33 +; GLOBALNESS0-NEXT: ; %bb.32: ; %bb11.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -1546,7 +1534,7 @@ ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock +; GLOBALNESS0-NEXT: .LBB1_33: ; %UnifiedUnreachableBlock bb: store i32 0, i32 addrspace(1)* null, align 4 %tmp4 = load i32, i32 addrspace(1)* %arg1.global, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -10,7 +10,6 @@ ; GCN-NEXT: .LBB0_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v1, v5 ; GCN-NEXT: s_and_b32 s2, exec_lo, s3 ; GCN-NEXT: s_or_b32 s0, s2, s0 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 @@ -20,22 +19,18 @@ ; GCN-NEXT: s_add_i32 s1, s1, 1 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s1, v2 -; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GCN-NEXT: s_xor_b32 s3, exec_lo, s3 ; 
GCN-NEXT: ; %bb.3: ; %else ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v1 ; GCN-NEXT: s_and_b32 s2, vcc_lo, exec_lo ; GCN-NEXT: ; %bb.4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_saveexec_b32 s3, s3 -; GCN-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: s_andn2_saveexec_b32 s3, s3 ; GCN-NEXT: ; %bb.5: ; %if ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_or_b32 s2, s2, exec_lo ; GCN-NEXT: ; %bb.6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -46,12 +41,11 @@ ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: v_mov_b32_e32 v4, v5 ; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_8: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog entry: br label %header diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll @@ -68,7 +68,7 @@ ; CHECK: cond.end61: ; CHECK-NEXT: br label [[FLOW7]] ; CHECK: Flow14: -; CHECK-NEXT: [[TMP15:%.*]] = phi i1 [ [[TMP20:%.*]], [[FLOW15:%.*]] ], [ [[TMP17:%.*]], [[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i1 [ [[TMP20:%.*]], [[FLOW15:%.*]] ], [ undef, [[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: [[TMP16:%.*]] = phi i1 [ [[TMP21:%.*]], [[FLOW15]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: br label [[FLOW13:%.*]] ; CHECK: if.then69: @@ -102,7 +102,7 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: Flow12: -; CHECK-NEXT: [[TMP17]] = phi i1 [ true, 
[[LOR_RHS]] ], [ undef, [[WHILE_COND]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ true, [[LOR_RHS]] ], [ undef, [[WHILE_COND]] ] ; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ false, [[LOR_RHS]] ], [ true, [[WHILE_COND]] ] ; CHECK-NEXT: [[TMP19:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ [[PRED3]], [[WHILE_COND]] ] ; CHECK-NEXT: br i1 [[TMP19]], label [[IRR_GUARD]], label [[FLOW13]] diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll @@ -38,7 +38,7 @@ ; CHECK: Flow: ; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[FLOW2]] ], [ undef, [[H2]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[H2]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP6:%.*]], [[FLOW2]] ], [ true, [[H2]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW2]] ], [ true, [[H2]] ] ; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_EXIT_GUARD1:%.*]], label [[H2]] ; CHECK: L2: ; CHECK-NEXT: br label [[FLOW2]] @@ -51,17 +51,18 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: Flow5: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[L1:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ undef, [[L1:%.*]] ], [ [[TMP2]], [[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[L1]] ], [ true, [[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: br label [[FLOW4]] ; CHECK: loop.exit.guard: -; CHECK-NEXT: br i1 [[TMP7:%.*]], label [[C:%.*]], label [[EXIT]] +; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[C:%.*]], label [[EXIT]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP6]] = phi i1 [ false, [[L2]] ], [ true, [[B2]] ] +; CHECK-NEXT: [[TMP7]] = phi i1 [ false, [[L2]] ], [ true, [[B2]] ] ; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow4: -; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP2]], [[FLOW5]] ], [ [[TMP0]], 
[[FLOW3]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi i1 [ [[TMP5]], [[FLOW5]] ], [ true, [[FLOW3]] ] -; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_EXIT_GUARD:%.*]], label [[H1]] +; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW5]] ], [ [[TMP0]], [[FLOW3]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[TMP6]], [[FLOW5]] ], [ true, [[FLOW3]] ] +; CHECK-NEXT: br i1 [[TMP9]], label [[LOOP_EXIT_GUARD:%.*]], label [[H1]] ; CHECK: loop.exit.guard1: ; CHECK-NEXT: br i1 [[TMP3]], label [[L1]], label [[FLOW5]] ;