Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -68,6 +68,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -82,6 +83,9 @@
 namespace {
 
 class SIFixSGPRCopies : public MachineFunctionPass {
+
+  MachineDominatorTree *MDT;
+
 public:
   static char ID;
 
@@ -94,6 +98,7 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -101,8 +106,12 @@
 
 } // End anonymous namespace
 
-INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
-                "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
+                      "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
+                    "SI Fix SGPR copies", false, false)
+
 
 char SIFixSGPRCopies::ID = 0;
 
@@ -236,11 +245,33 @@
   return true;
 }
 
+/// \returns true if \p MBB falls through or its first terminator is one of
+/// the scalar branch opcodes (S_BRANCH / S_CBRANCH_*) handled below.
+static bool hasUniformTerminator(const MachineBasicBlock *MBB) {
+  MachineBasicBlock::const_iterator Term = MBB->getFirstTerminator();
+
+  // No terminator means this is a fall-through which is a uniform branch.
+  if (Term == MBB->end())
+    return true;
+
+  switch (Term->getOpcode()) {
+  default:
+    return false;
+  case AMDGPU::S_BRANCH:
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+  case AMDGPU::S_CBRANCH_VCCNZ:
+  case AMDGPU::S_CBRANCH_VCCZ:
+    return true;
+  }
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
 
   SmallVector<MachineInstr *, 16> Worklist;
 
@@ -271,11 +302,23 @@
         break;
       }
       case AMDGPU::PHI: {
-        DEBUG(dbgs() << "Fixing PHI: " << MI);
         unsigned Reg = MI.getOperand(0).getReg();
         if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
           break;
 
+        // We don't need to fix the PHI if the common dominator of the
+        // two incoming blocks terminates with a uniform branch.
+        if (MI.getNumExplicitOperands() == 5) {
+          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
+          if (NCD && hasUniformTerminator(NCD)) {
+            DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+            break;
+          }
+        }
+
         // If a PHI node defines an SGPR and any of its operands are VGPRs,
         // then we need to move it to the VALU.
         //
@@ -302,10 +345,6 @@
         // ...
         // use sgpr2
         //
-        // FIXME: This is OK if the branching decision is made based on an
-        // SGPR value.
-        bool SGPRBranch = false;
-
         // The one exception to this rule is when one of the operands
         // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
         // instruction. In this case, there we know the program will
@@ -313,6 +352,7 @@
         // the first block (where the condition is computed), so there
         // is no chance for values to be over-written.
 
+        DEBUG(dbgs() << "Fixing PHI: " << MI);
         bool HasBreakDef = false;
         for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
           unsigned Reg = MI.getOperand(i).getReg();
@@ -336,7 +376,7 @@
           }
         }
 
-        if (!SGPRBranch && !HasBreakDef)
+        if (!HasBreakDef)
           TII->moveToVALU(MI);
         break;
       }
Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -97,7 +97,7 @@
 
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: v_cmp_eq_i32_e32 vcc, 1,
-; GCN: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
 ; GCN: s_cbranch_vccnz [[LOOPBB]]
 ; GCN-NEXT: ; BB#2
Index: test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -12,7 +12,7 @@
 ; GCN: ds_read_b32
 ; GCN: buffer_store_dword
 
-; GCN: s_cbranch_vccz BB0_2
+; GCN: s_cbranch_scc0 BB0_2
 
 ; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -27,9 +27,8 @@
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
 ; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
-; GCN: v_cmp_lt_i32_e32 vcc,
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1
+; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; BB#2: ; %bb4
 ; GCN: buffer_load_dword
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -281,12 +281,9 @@
 
 ; SI-LABEL: {{^}}uniform_loop:
 ; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
-; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we
-; get s_add_i32 here.
-; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
-; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]]
-; SI: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccnz [[LOOP_LABEL]]
+; SI: s_add_i32 [[I:s[0-9]+]], s{{[0-9]+}}, -1
+; SI: s_cmp_lg_i32 [[I]], 0
+; SI: s_cbranch_scc1 [[LOOP_LABEL]]
 ; SI: s_endpgm
 define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
 entry:
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -79,9 +79,8 @@
 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword
 ; SI-DAG: buffer_store_dword
-; SI-DAG: v_cmp_eq_i32_e32 vcc,
-; SI-DAG: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccz [[LABEL_LOOP]]
+; SI-DAG: s_cmp_eq_i32
+; SI: s_cbranch_scc0 [[LABEL_LOOP]]
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
 