diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" @@ -246,6 +247,7 @@ SmallVector Order; BBSet Visited; + BBSet FlowSet; SmallVector AffectedPhis; BBPhiMap DeletedPhis; @@ -643,6 +645,8 @@ if (!DeletedPhis.count(To)) continue; + SmallVector UndefBlks; + bool CachedUndefs = false; PhiMap &Map = DeletedPhis[To]; for (const auto &PI : Map) { PHINode *Phi = PI.first; @@ -651,15 +655,51 @@ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); Updater.AddAvailableValue(To, Undef); - NearestCommonDominator Dominator(DT); - Dominator.addBlock(To); + SmallSet Incomings; for (const auto &VI : PI.second) { + Incomings.insert(VI.first); Updater.AddAvailableValue(VI.first, VI.second); - Dominator.addAndRememberBlock(VI.first); } - if (!Dominator.resultIsRememberedBlock()) - Updater.AddAvailableValue(Dominator.result(), Undef); + if (!CachedUndefs) { + // Build the undef blocks cache. + SmallSet VisitedBlock; + SmallVector Stack; + if (To == ParentRegion->getExit()) { + for (auto P : predecessors(To)) { + if (ParentRegion->contains(P)) + Stack.push_back(P); + } + } else { + append_range(Stack, predecessors(To)); + } + + while (!Stack.empty() && !Incomings.empty()) { + BasicBlock *Current = Stack.pop_back_val(); + if (VisitedBlock.contains(Current)) + continue; + + VisitedBlock.insert(Current); + if (!FlowSet.contains(Current)) { + if (Incomings.contains(Current)) { + Incomings.erase(Current); + } else { + // If this is not flow node and not the incoming block before + // structurization, then this block does not contribute any value + // to the phi. + UndefBlks.push_back(Current); + } + continue; + } + + for (auto P : predecessors(Current)) + Stack.push_back(P); + } + CachedUndefs = true; + } + + for (auto UB : UndefBlks) + Updater.AddAvailableValue(UB, Undef); for (BasicBlock *FI : From) Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI)); @@ -759,6 +799,7 @@ Order.back()->getEntry(); BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, Func, Insert); + FlowSet.insert(Flow); DT->addNewBlock(Flow, Dominator); ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); return Flow; @@ -1103,6 +1144,7 @@ Loops.clear(); LoopPreds.clear(); LoopConds.clear(); + FlowSet.clear(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -9,34 +9,32 @@ ; OPT-NEXT: main_body: ; OPT-NEXT: br label [[LOOP_OUTER:%.*]] ; OPT: LOOP.outer: -; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] -; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ] +; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] +; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP3:%.*]], [[FLOW1]] ] ; OPT-NEXT: br label [[LOOP:%.*]] ; OPT: LOOP: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] -; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ] -; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ] +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] +; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP3]], [[FLOW]] ] ; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]] -; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]]) -; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0 -; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1 -; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]] +; OPT-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]]) +; OPT-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 +; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 +; OPT-NEXT: br i1 [[TMP1]], label [[ENDIF:%.*]], label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ] -; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ] -; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]]) -; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]]) -; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) -; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]]) -; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]] +; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ undef, [[LOOP]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) +; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) +; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) +; OPT-NEXT: br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]] ; OPT: Flow1: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) -; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]]) -; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]] +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) +; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) +; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]] ; OPT: IF: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]]) +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: ret void ; OPT: ENDIF: ; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 @@ -156,7 +154,7 @@ ; OPT-NEXT: [[CMP2]] = icmp sge i32 [[TMP]], [[LOAD2]] ; OPT-NEXT: br label [[FLOW3]] ; OPT: Flow5: -; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ [[TMP6]], [[LEAFBLOCK]] ] +; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ undef, [[LEAFBLOCK]] ] ; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP7]], [[LEAFBLOCK]] ] ; OPT-NEXT: br label [[FLOW4]] ; OPT: bb9: diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -10,7 +10,6 @@ ; GCN-NEXT: .LBB0_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v1, v5 ; GCN-NEXT: s_and_b32 s2, exec_lo, s3 ; GCN-NEXT: s_or_b32 s0, s2, s0 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 @@ -20,22 +19,18 @@ ; GCN-NEXT: s_add_i32 s1, s1, 1 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s1, v2 -; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GCN-NEXT: s_xor_b32 s3, exec_lo, s3 ; GCN-NEXT: ; %bb.3: ; %else ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v1 ; GCN-NEXT: s_and_b32 s2, vcc_lo, exec_lo ; GCN-NEXT: ; %bb.4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_saveexec_b32 s3, s3 -; GCN-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: s_andn2_saveexec_b32 s3, s3 ; GCN-NEXT: ; %bb.5: ; %if ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_or_b32 s2, s2, exec_lo ; GCN-NEXT: ; %bb.6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -46,12 +41,11 @@ ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: v_mov_b32_e32 v4, v5 ; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_8: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog entry: br label %header diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll @@ -68,7 +68,7 @@ ; CHECK: cond.end61: ; CHECK-NEXT: br label [[FLOW7]] ; CHECK: Flow14: -; CHECK-NEXT: [[TMP15:%.*]] = phi i1 [ [[TMP20:%.*]], [[FLOW15:%.*]] ], [ [[TMP17:%.*]], [[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i1 [ [[TMP20:%.*]], [[FLOW15:%.*]] ], [ undef, [[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: [[TMP16:%.*]] = phi i1 [ [[TMP21:%.*]], [[FLOW15]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: br label [[FLOW13:%.*]] ; CHECK: if.then69: @@ -102,7 +102,7 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: Flow12: -; CHECK-NEXT: [[TMP17]] = phi i1 [ true, [[LOR_RHS]] ], [ undef, [[WHILE_COND]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ true, [[LOR_RHS]] ], [ undef, [[WHILE_COND]] ] ; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ false, [[LOR_RHS]] ], [ true, [[WHILE_COND]] ] ; CHECK-NEXT: [[TMP19:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ [[PRED3]], [[WHILE_COND]] ] ; CHECK-NEXT: br i1 [[TMP19]], label [[IRR_GUARD]], label [[FLOW13]]