diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -34,6 +34,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -197,6 +198,7 @@ SmallVector Order; BBSet Visited; + SmallVector AffectedPhis; BBPhiMap DeletedPhis; BB2BBVecMap AddedPhis; @@ -232,6 +234,8 @@ void setPhiValues(); + void simplifyAffectedPhis(); + void killTerminator(BasicBlock *BB); void changeExit(RegionNode *Node, BasicBlock *NewExit, @@ -585,9 +589,14 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { PhiMap &Map = DeletedPhis[To]; for (PHINode &Phi : To->phis()) { + bool Recorded = false; while (Phi.getBasicBlockIndex(From) != -1) { Value *Deleted = Phi.removeIncomingValue(From, false); Map[&Phi].push_back(std::make_pair(From, Deleted)); + if (!Recorded) { + AffectedPhis.push_back(&Phi); + Recorded = true; + } } } } @@ -632,28 +641,29 @@ for (BasicBlock *FI : From) Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI)); + AffectedPhis.push_back(Phi); } DeletedPhis.erase(To); } assert(DeletedPhis.empty()); - // Simplify any phis inserted by the SSAUpdater if possible + AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); +} + +void StructurizeCFG::simplifyAffectedPhis() { bool Changed; do { Changed = false; - SimplifyQuery Q(Func->getParent()->getDataLayout()); Q.DT = DT; - for (size_t i = 0; i < InsertedPhis.size(); ++i) { - PHINode *Phi = InsertedPhis[i]; - if (Value *V = SimplifyInstruction(Phi, Q)) { - Phi->replaceAllUsesWith(V); - Phi->eraseFromParent(); - InsertedPhis[i] = InsertedPhis.back(); - InsertedPhis.pop_back(); - i--; - Changed = true; + for (auto VH : AffectedPhis) { + if (auto Phi = dyn_cast_or_null(VH)) { + if (auto NewValue = SimplifyInstruction(Phi, Q)) { + Phi->replaceAllUsesWith(NewValue); + Phi->eraseFromParent(); + Changed = true; + } } } } while (Changed); @@ -886,6 +896,7 @@ BasicBlock *Exit = ParentRegion->getExit(); bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + AffectedPhis.clear(); DeletedPhis.clear(); AddedPhis.clear(); Conditions.clear(); @@ -1044,6 +1055,7 @@ insertConditions(false); insertConditions(true); setPhiValues(); + simplifyAffectedPhis(); rebuildSSA(); // Cleanup diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -91,8 +91,8 @@ ; OPT-NEXT: br label [[BB1:%.*]] ; OPT: bb1: ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] ; OPT: bb4: @@ -100,7 +100,6 @@ ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] ; OPT-NEXT: br label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] ; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ undef, [[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) @@ -185,8 +184,8 @@ ; OPT-NEXT: br label [[BB1:%.*]] ; OPT: bb1: ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] ; OPT: bb4: @@ -194,7 +193,6 @@ ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] ; OPT-NEXT: br label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] ; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), [[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) @@ -278,8 +276,8 @@ ; OPT-NEXT: br label [[BB1:%.*]] ; OPT: bb1: ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] ; OPT: bb4: @@ -287,7 +285,6 @@ ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] ; OPT-NEXT: br label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] ; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) @@ -367,8 +364,8 @@ ; OPT-NEXT: br label [[BB1:%.*]] ; OPT: bb1: ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] ; OPT: bb4: @@ -376,7 +373,6 @@ ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] ; OPT-NEXT: br label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] ; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ false, [[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) @@ -459,8 +455,8 @@ ; OPT-NEXT: br label [[BB1:%.*]] ; OPT: bb1: ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP1:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] ; OPT: bb4: @@ -468,7 +464,6 @@ ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] ; OPT-NEXT: br label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] ; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] ; OPT-NEXT: [[TMP0:%.*]] = xor i1 [[MY_TMP3]], true ; OPT-NEXT: [[TMP1]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP0]], i64 [[PHI_BROKEN]]) diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -48,8 +48,8 @@ ; IR: bb4: ; IR-NEXT: br label [[FLOW:%.*]] ; IR: bb5: -; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP3:%.*]], [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] -; IR-NEXT: [[MY_TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[MY_TMP11:%.*]], [[BB10]] ] +; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] +; IR-NEXT: [[MY_TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP5:%.*]], [[BB10]] ] ; IR-NEXT: [[MY_TMP7:%.*]] = icmp eq i32 [[MY_TMP6]], 1 ; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP7]]) ; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 @@ -60,15 +60,13 @@ ; IR: bb9: ; IR-NEXT: br i1 false, label [[BB3:%.*]], label [[BB9:%.*]] ; IR: bb10: -; IR-NEXT: [[MY_TMP11]] = phi i32 [ [[TMP6:%.*]], [[FLOW]] ] -; IR-NEXT: [[MY_TMP12:%.*]] = phi i1 [ [[TMP5:%.*]], [[FLOW]] ] -; IR-NEXT: [[TMP3]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP12]], i64 [[PHI_BROKEN]]) -; IR-NEXT: [[TMP4:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP3]]) -; IR-NEXT: br i1 [[TMP4]], label [[BB23:%.*]], label [[BB5]] +; IR-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) +; IR-NEXT: br i1 [[TMP3]], label [[BB23:%.*]], label [[BB5]] ; IR: Flow: -; IR-NEXT: [[TMP5]] = phi i1 [ [[MY_TMP22:%.*]], [[BB4]] ], [ true, [[BB5]] ] -; IR-NEXT: [[TMP6]] = phi i32 [ [[MY_TMP21:%.*]], [[BB4]] ], [ undef, [[BB5]] ] +; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[MY_TMP22:%.*]], [[BB4]] ], [ true, [[BB5]] ] +; IR-NEXT: [[TMP5]] = phi i32 [ [[MY_TMP21:%.*]], [[BB4]] ], [ undef, [[BB5]] ] ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) +; IR-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]) ; IR-NEXT: br label [[BB10]] ; IR: bb13: ; IR-NEXT: [[MY_TMP14:%.*]] = phi i1 [ [[MY_TMP22]], [[BB3]] ], [ true, [[BB8]] ] @@ -84,7 +82,7 @@ ; IR-NEXT: [[MY_TMP22]] = phi i1 [ false, [[BB16]] ], [ [[MY_TMP14]], [[BB13]] ] ; IR-NEXT: br label [[BB9]] ; IR: bb23: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]]) +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll @@ -28,7 +28,7 @@ ; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4 ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: LOOP.HEADER: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP7:%.*]], [[FLOW3:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[FLOW3:%.*]] ] ; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]] @@ -50,8 +50,8 @@ ; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52 ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP9:%.*]], [[FLOW]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP11:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP8:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP10:%.*]], [[FLOW]] ] ; CHECK-NEXT: br i1 [[TMP5]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]] ; CHECK: INNER_LOOP: ; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ] @@ -68,28 +68,26 @@ ; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[LOAD13]], true ; CHECK-NEXT: br i1 [[TMP6]], label [[INCREMENT_I]], label [[FLOW1:%.*]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP7]] = phi i32 [ [[I_FINAL:%.*]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] -; CHECK-NEXT: br i1 [[TMP8]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] +; CHECK-NEXT: [[TMP7:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] +; CHECK-NEXT: br i1 [[TMP7]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] ; CHECK: Flow4: -; CHECK-NEXT: br i1 [[TMP10:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 [[TMP9:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] ; CHECK: bb64: ; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #0 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP9]] = phi i32 [ [[TMP1]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP10]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP11]] = phi i1 [ [[TMP3]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP12:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] -; CHECK-NEXT: br i1 [[TMP12]], label [[BB18]], label [[FLOW2]] +; CHECK-NEXT: [[TMP8]] = phi i32 [ [[TMP1]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP10]] = phi i1 [ [[TMP3]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] +; CHECK-NEXT: br i1 [[TMP11]], label [[BB18]], label [[FLOW2]] ; CHECK: INCREMENT_I: ; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336 ; CHECK-NEXT: br label [[FLOW1]] ; CHECK: END_ELSE_BLOCK: -; CHECK-NEXT: [[I_FINAL]] = phi i32 [ [[TMP4]], [[FLOW2]] ] ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1337 -; CHECK-NEXT: [[CMP_END_ELSE_BLOCK]] = icmp eq i32 [[I_FINAL]], -1 +; CHECK-NEXT: [[CMP_END_ELSE_BLOCK]] = icmp eq i32 [[TMP4]], -1 ; CHECK-NEXT: br label [[FLOW3]] ; CHECK: RETURN: ; CHECK-NEXT: call void asm sideeffect "s_nop 0x99