Index: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -68,6 +68,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -82,6 +83,9 @@ namespace { class SIFixSGPRCopies : public MachineFunctionPass { + + MachineDominatorTree *MDT; + public: static char ID; @@ -92,6 +96,8 @@ StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -99,8 +105,12 @@ } // End anonymous namespace -INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, - "SI Fix SGPR copies", false, false) +INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; @@ -274,11 +284,22 @@ return false; } +static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB, + const TargetRegisterInfo &TRI) { + for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(), + E = MBB.end(); I != E; ++I) { + if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) + return true; + } + return false; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); + MDT = &getAnalysis(); SmallVector Worklist; @@ -309,11 +330,23 @@ break; } case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; + // We don't need to fix the PHI if the common dominator of the + // two incoming blocks terminates with a uniform branch. + if (MI.getNumExplicitOperands() == 5) { + MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); + MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); + + MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1); + if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) { + DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); + break; + } + } + // If a PHI node defines an SGPR and any of its operands are VGPRs, // then we need to move it to the VALU. // @@ -340,10 +373,6 @@ // ... // use sgpr2 // - // FIXME: This is OK if the branching decision is made based on an - // SGPR value. - bool SGPRBranch = false; - // The one exception to this rule is when one of the operands // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK // instruction. In this case, there we know the program will @@ -353,7 +382,8 @@ SmallSet Visited; if (phiHasVGPROperands(MI, MRI, TRI, TII) || - (!SGPRBranch && !phiHasBreakDef(MI, MRI, Visited))) { + !phiHasBreakDef(MI, MRI, Visited)) { + DEBUG(dbgs() << "Fixing PHI: " << MI); TII->moveToVALU(MI); } break; Index: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll +++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -163,15 +163,13 @@ ret void } -; FIXME: Should be able to use s_cbranch_scc0 ; GCN-LABEL: {{^}}long_backward_sbranch: -; GCN: v_mov_b32_e32 [[LOOPIDX:v[0-9]+]], 0{{$}} +; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}} ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]] -; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]] -; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1 +; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 @@ -179,7 +177,7 @@ ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]] ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1 Index: llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -98,6 +98,9 @@ ; GCN: v_cmp_eq_u32_e32 vcc, 1, ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]] +; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80 +; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 4 + ; GCN: s_cbranch_vccnz [[LOOPBB]] ; GCN-NEXT: ; BB#2 ; GCN-NEXT: s_endpgm Index: llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll +++ llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -12,7 +12,7 @@ ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 9 +; CHECK: ; NumVgprs: 8 define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 Index: llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll +++ llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll @@ -4,7 +4,7 @@ ; At the same time condition shall not be serialized into a VGPR and deserialized later ; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64. -; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]] +; CHECK: v_cmp_{{..}}_u32_e{{32|64}} [[COND:s\[[0-9]+:[0-9]+\]|vcc]] ; CHECK: BB0_1: ; CHECK-NOT: v_cmp ; CHECK_NOT: v_cndmask Index: llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll +++ llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll @@ -1,5 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; FIXME: Enabling critical edge splitting will fix this. +; XFAIL: * + ; Make sure that m0 is not reinitialized in the loop. ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init: @@ -12,7 +15,7 @@ ; GCN: ds_read_b32 ; GCN: buffer_store_dword -; GCN: s_cbranch_vccz BB0_2 +; GCN: s_cbranch_scc0 BB0_2 ; GCN: BB0_3: ; GCN-NEXT: s_endpgm Index: llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll +++ llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll @@ -27,9 +27,8 @@ ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1 ; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]] -; GCN: v_cmp_lt_i32_e32 vcc, -; GCN: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]] +; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1 +; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: ; BB#2: ; %bb4 ; GCN: buffer_load_dword Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll +++ llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -1,16 +1,17 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}uniform_if_scc: ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; GCN: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 @@ -29,17 +30,16 @@ } ; GCN-LABEL: {{^}}uniform_if_vcc: -; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and -; also scheduled the write first. ; GCN-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 @@ -59,14 +59,15 @@ ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc: ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; GCN: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 @@ -85,17 +86,16 @@ } ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_vcc: -; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and -; also scheduled the write first. ; GCN-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 @@ -276,15 +276,12 @@ ret void } -; GCN-LABEL: {{^}}uniform_loop: -; GCN: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]: -; FIXME: We need to teach GCNFixSGPRCopies about uniform branches so we -; get s_add_i32 here. -; GCN: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}} -; GCN: v_cmp_ne_u32_e32 vcc, 0, [[I]] -; GCN: s_and_b64 vcc, exec, vcc -; GCN: s_cbranch_vccnz [[LOOP_LABEL]] -; GCN: s_endpgm +; SI-LABEL: {{^}}uniform_loop: +; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]: +; SI: s_add_i32 [[I:s[0-9]+]], s{{[0-9]+}}, -1 +; SI: s_cmp_lg_u32 [[I]], 0 +; SI: s_cbranch_scc1 [[LOOP_LABEL]] +; SI: s_endpgm define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { entry: br label %loop @@ -433,7 +430,7 @@ ; GCN-LABEL: {{^}}uniform_if_scc_i64_eq: ; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0 -; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; SI: v_cmp_eq_u64_e64 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] @@ -441,10 +438,11 @@ ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i64 %cond, 0 @@ -464,7 +462,7 @@ ; GCN-LABEL: {{^}}uniform_if_scc_i64_ne: ; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0 -; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; SI: v_cmp_ne_u64_e64 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] @@ -472,10 +470,11 @@ ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp ne i64 %cond, 0 @@ -494,14 +493,16 @@ } ; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt: +; GCN: s_mov_b32 [[S_VAL:s[0-9]+]], 0 ; GCN: v_cmp_gt_i64_e64 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else -; GCN: v_mov_b32_e32 [[STORE_VAL]], 1 +; GCN: s_mov_b32 [[S_VAL]], 1 ; GCN: [[IF_LABEL]]: -; GCN: buffer_store_dword [[STORE_VAL]] +; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]] +; GCN: buffer_store_dword [[V_VAL]] define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp sgt i64 %cond, 0 Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -7,13 +7,11 @@ ; CHECK: s_and_saveexec_b64 ; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch -; CHECK-NEXT: s_cbranch_execz ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: -; CHECK: s_and_b64 vcc, exec, vcc -; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]] +; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]] ; CHECK: s_endpgm define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) { Index: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll +++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll @@ -101,9 +101,8 @@ ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword -; SI-DAG: v_cmp_eq_u32_e32 vcc, -; SI-DAG: s_and_b64 vcc, exec, vcc -; SI: s_cbranch_vccz [[LABEL_LOOP]] +; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100 +; SI: s_cbranch_scc0 [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm