Index: llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -51,6 +51,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -81,6 +82,7 @@ const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; MachineRegisterInfo *MRI = nullptr; + DenseSet LoweredEndCf; const TargetRegisterClass *BoolRC = nullptr; unsigned AndOpc; @@ -103,6 +105,13 @@ void combineMasks(MachineInstr &MI); + // Skip to the next instruction, ignoring debug instructions, and trivial + // block boundaries (blocks that have one (typically fallthrough) successor, + // and the successor has one predecessor). + MachineBasicBlock::iterator + skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + public: static char ID; @@ -396,6 +405,36 @@ MI.eraseFromParent(); } +MachineBasicBlock::iterator +SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( + MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { + + SmallSet Visited; + MachineBasicBlock *B = &MBB; + do { + if (!Visited.insert(B).second) + return MBB.end(); + + auto E = B->end(); + for ( ; It != E; ++It) { + if (TII->mayReadEXEC(*MRI, *It)) + break; + } + + if (It != E) + return It; + + if (B->succ_size() != 1) + return MBB.end(); + + // If there is one trivial successor, advance to the next block. 
+ MachineBasicBlock *Succ = *B->succ_begin(); + + It = Succ->begin(); + B = Succ; + } while (true); +} + void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -403,6 +442,18 @@ MachineInstr *Def = MRI.getUniqueVRegDef(CFMask); const DebugLoc &DL = MI.getDebugLoc(); + // If the only instruction immediately following this END_CF is another + // END_CF in the only successor we can avoid emitting exec mask restore here. + auto Next = skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator())); + if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF || + LoweredEndCf.count(&*Next))) { + LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump()); + if (LIS) + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + return; + } + MachineBasicBlock::iterator InsPt = Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def)) : MBB.begin(); @@ -410,6 +461,8 @@ .addReg(Exec) .add(MI.getOperand(0)); + LoweredEndCf.insert(NewMI); + if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -556,5 +609,7 @@ } } + LoweredEndCf.clear(); + return true; } Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1,10 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-opt-exec-mask-pre-ra=0 < %s | FileCheck -enable-var-scope -check-prefixes=DISABLED,ALL %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; ALL-LABEL: {{^}}simple_nested_if: +; GCN-LABEL: {{^}}simple_nested_if: ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]] ; GCN-NEXT: 
s_cbranch_execz [[ENDIF:BB[0-9_]+]] -; GCN: s_and_b64 exec, exec, vcc + +; TODO: this does not need to save exec, just perform the and. +; GCN: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc + ; GCN-NEXT: s_cbranch_execz [[ENDIF]] ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword @@ -13,9 +15,6 @@ ; GCN: ds_write_b32 ; GCN: s_endpgm - -; DISABLED: s_or_b64 exec, exec -; DISABLED: s_or_b64 exec, exec define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -39,7 +38,7 @@ ret void } -; ALL-LABEL: {{^}}uncollapsable_nested_if: +; GCN-LABEL: {{^}}uncollapsable_nested_if: ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]] ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]] @@ -82,7 +81,7 @@ ret void } -; ALL-LABEL: {{^}}nested_if_if_else: +; GCN-LABEL: {{^}}nested_if_if_else: ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]] ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]] @@ -128,7 +127,7 @@ ret void } -; ALL-LABEL: {{^}}nested_if_else_if: +; GCN-LABEL: {{^}}nested_if_else_if: ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]] ; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]] ; GCN-NEXT: s_cbranch_execz [[THEN_OUTER:BB[0-9_]+]] @@ -151,9 +150,9 @@ ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword ; GCN-NEXT: [[FLOW1]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]] -; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]] +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]] +; GCN-NOT: s_or_b64 exec +; GCN-NOT: {{^.*:}} ; GCN: ds_write_b32 ; GCN: s_endpgm define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) { @@ -191,7 +190,7 @@ ret void } -; ALL-LABEL: {{^}}s_endpgm_unsafe_barrier: +; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier: ; GCN: s_and_saveexec_b64 
[[SAVEEXEC:s\[[0-9:]+\]]] ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]] ; GCN-NEXT: ; %bb.{{[0-9]+}}: @@ -216,8 +215,7 @@ ret void } -; Make sure scc liveness is updated if sor_b64 is removed -; ALL-LABEL: {{^}}scc_liveness: +; GCN-LABEL: {{^}}scc_liveness: ; GCN: %bb10 ; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}} @@ -229,7 +227,9 @@ ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]] ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}} + +; TODO: this does not need to save exec, just perform the and. +; GCN: s_and_saveexec_b64 s[{{[0-9:]+}}], {{vcc|s\[[0-9:]+\]}} ; GCN-NOT: s_or_b64 exec, exec Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf.mir +++ llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -46,7 +46,6 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: DBG_VALUE ; GCN: bb.4: ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc @@ -146,7 +145,6 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: bb.4: ; GCN: successors: %bb.5(0x80000000) ; GCN: bb.5: @@ -246,7 +244,6 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: bb.4: ; GCN: successors: %bb.5(0x80000000) ; GCN: DBG_VALUE @@ -347,7 +344,6 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit 
$exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN: KILL [[DEF]] @@ -450,7 +446,6 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN: KILL [[DEF]] @@ -749,7 +744,6 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.5(0x80000000) - ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: S_BRANCH %bb.5 ; GCN: bb.4: ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -58,7 +58,7 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] -; GFX9-NEXT: s_cbranch_execz BB1_4 +; GFX9-NEXT: s_cbranch_execz BB1_3 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6 @@ -100,9 +100,7 @@ ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13] ; GFX9-NEXT: s_cbranch_execnz BB1_2 -; GFX9-NEXT: ; %bb.3: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[12:13] -; GFX9-NEXT: BB1_4: ; %Flow3 +; GFX9-NEXT: BB1_3: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31]