Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -134,6 +134,38 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; +static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) { + unsigned SaveExecReg = MI.getOperand(0).getReg(); + auto U = MRI->use_instr_nodbg_begin(SaveExecReg); + + if (U == MRI->use_instr_nodbg_end() || + std::next(U) != MRI->use_instr_nodbg_end() || + U->getOpcode() != AMDGPU::SI_END_CF) + return false; + + // Check for SI_KILL_TERMINATOR on path from if to endif. + // If there is any such terminator simplifications are not safe. + auto SMBB = MI.getParent(); + auto EMBB = U->getParent(); + DenseSet<const MachineBasicBlock *> Visited; + SmallVector<MachineBasicBlock *, 4> Worklist(SMBB->succ_begin(), + SMBB->succ_end()); + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + + if (MBB == EMBB || !Visited.insert(MBB).second) + continue; + for(auto &Term : MBB->terminators()) + if (Term.getOpcode() == AMDGPU::SI_KILL_TERMINATOR) + return false; + + Worklist.append(MBB->succ_begin(), MBB->succ_end()); + } + + return true; +} + void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -152,11 +184,7 @@ // If there is only one use of save exec register and that use is SI_END_CF, // we can optimize SI_IF by returning the full saved exec mask instead of // just cleared bits. - bool SimpleIf = false; - auto U = MRI->use_instr_nodbg_begin(SaveExecReg); - SimpleIf = U != MRI->use_instr_nodbg_end() && - std::next(U) == MRI->use_instr_nodbg_end() && - U->getOpcode() == AMDGPU::SI_END_CF; + bool SimpleIf = isSimpleIf(MI, MRI); // Add an implicit def of exec to discourage scheduling VALU after this which // will interfere with trying to form s_and_saveexec_b64 later. 
Index: llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}if_with_kill: +; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]], +; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]] +define amdgpu_ps void @if_with_kill(i32 %arg) { +.entry: + %cmp = icmp eq i32 %arg, 32 + br i1 %cmp, label %then, label %endif + +then: + tail call void @llvm.AMDGPU.kilp() + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}if_with_loop_kill_after: +; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]], +; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]] +define amdgpu_ps void @if_with_loop_kill_after(i32 %arg) { +.entry: + %cmp = icmp eq i32 %arg, 32 + br i1 %cmp, label %then, label %endif + +then: + %sub = sub i32 %arg, 1 + br label %loop + +loop: + %ind = phi i32 [%sub, %then], [%dec, %loop] + %dec = sub i32 %ind, 1 + %cc = icmp ne i32 %ind, 0 + br i1 %cc, label %loop, label %break + +break: + tail call void @llvm.AMDGPU.kilp() + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}if_with_kill_inside_loop: +; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]], +; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]] +define amdgpu_ps void @if_with_kill_inside_loop(i32 %arg) { +.entry: + %cmp = icmp eq i32 %arg, 32 + br i1 %cmp, label %then, label %endif + +then: + %sub = sub i32 %arg, 1 + br label %loop + +loop: + %ind = phi i32 [%sub, %then], [%dec, %loop] + %dec = sub i32 %ind, 1 + %cc = icmp ne i32 %ind, 0 + tail call void @llvm.AMDGPU.kilp() + br i1 %cc, label %loop, label %break + +break: + br label %endif + +endif: + ret void +} + +declare void @llvm.AMDGPU.kilp() Index: llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll 
=================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll +++ llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -202,6 +202,7 @@ ; CHECK-LABEL: {{^}}test_kill_divergent_loop: ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc +; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]] ; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: s_cbranch_execz [[EXIT]] @@ -336,6 +337,7 @@ ; CHECK-LABEL: {{^}}if_after_kill_block: ; CHECK: ; BB#0: ; CHECK: s_and_saveexec_b64 +; CHECK: s_xor_b64 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]] ; CHECK: v_cmpx_le_f32_e32 vcc, 0,