Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1014,6 +1014,7 @@
   addPass(&PostRAHazardRecognizerID);
 
   addPass(&SIRemoveShortExecBranchesID);
+  addPass(&PeepholeOptimizerID);
   addPass(&SIInsertSkipsPassID);
   addPass(&BranchRelaxationPassID);
 }
Index: llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -361,98 +361,6 @@
   return true;
 }
 
-bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
-  // Match:
-  // sreg = -1
-  // vcc = S_AND_B64 exec, sreg
-  // S_CBRANCH_VCC[N]Z
-  // =>
-  // S_CBRANCH_EXEC[N]Z
-  bool Changed = false;
-  MachineBasicBlock &MBB = *MI.getParent();
-  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
-  const bool IsWave32 = ST.isWave32();
-  const unsigned CondReg = TRI->getVCC();
-  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-
-  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
-                                      E = MBB.rend();
-  bool ReadsCond = false;
-  unsigned Threshold = 5;
-  for (++A ; A != E ; ++A) {
-    if (!--Threshold)
-      return false;
-    if (A->modifiesRegister(ExecReg, TRI))
-      return false;
-    if (A->modifiesRegister(CondReg, TRI)) {
-      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
-        return false;
-      break;
-    }
-    ReadsCond |= A->readsRegister(CondReg, TRI);
-  }
-  if (A == E)
-    return false;
-
-  MachineOperand &Op1 = A->getOperand(1);
-  MachineOperand &Op2 = A->getOperand(2);
-  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
-    TII->commuteInstruction(*A);
-    Changed = true;
-  }
-  if (Op1.getReg() != ExecReg)
-    return Changed;
-  if (Op2.isImm() && Op2.getImm() != -1)
-    return Changed;
-
-  unsigned SReg = AMDGPU::NoRegister;
-  if (Op2.isReg()) {
-    SReg = Op2.getReg();
-    auto M = std::next(A);
-    bool ReadsSreg = false;
-    for ( ; M != E ; ++M) {
-      if (M->definesRegister(SReg, TRI))
-        break;
-      if (M->modifiesRegister(SReg, TRI))
-        return Changed;
-      ReadsSreg |= M->readsRegister(SReg, TRI);
-    }
-    if (M == E ||
-        !M->isMoveImmediate() ||
-        !M->getOperand(1).isImm() ||
-        M->getOperand(1).getImm() != -1)
-      return Changed;
-    // First if sreg is only used in and instruction fold the immediate
-    // into that and.
-    if (!ReadsSreg && Op2.isKill()) {
-      A->getOperand(2).ChangeToImmediate(-1);
-      M->eraseFromParent();
-    }
-  }
-
-  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
-      MI.killsRegister(CondReg, TRI))
-    A->eraseFromParent();
-
-  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
-  if (SReg == ExecReg) {
-    if (IsVCCZ) {
-      MI.eraseFromParent();
-      return true;
-    }
-    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
-  } else {
-    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
-                               : AMDGPU::S_CBRANCH_EXECNZ));
-  }
-
-  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
-  MI.addImplicitDefUseOperands(*MBB.getParent());
-
-  return true;
-}
-
 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -533,12 +441,6 @@
             MDT->getBase().insertEdge(&MBB, EmptyMBBAtEnd);
         }
         break;
-
-      case AMDGPU::S_CBRANCH_VCCZ:
-      case AMDGPU::S_CBRANCH_VCCNZ:
-        MadeChange |= optimizeVccBranch(MI);
-        break;
-
       default:
         break;
       }
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -311,6 +311,8 @@
                           unsigned DstReg, ArrayRef<MachineOperand> Cond,
                           unsigned TrueReg, unsigned FalseReg) const;
 
+  bool optimizeCondBranch(MachineInstr &MI) const override;
+
   unsigned getAddressSpaceForPseudoSourceKind(
     unsigned Kind) const override;
 
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2127,6 +2127,108 @@
   return Count;
 }
 
+bool SIInstrInfo::optimizeCondBranch(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_CBRANCH_VCCZ:
+  case AMDGPU::S_CBRANCH_VCCNZ: {
+    // Optimize the vcc branch instruction with the given pattern
+    // into an exec branch instruction.
+    // Pattern:
+    // sreg = -1
+    // vcc = S_AND_B64 exec, sreg
+    // S_CBRANCH_VCC[N]Z
+    // =>
+    // S_CBRANCH_EXEC[N]Z
+    //
+    MachineBasicBlock &MBB = *MI.getParent();
+    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+    const bool IsWave32 = ST.isWave32();
+    const unsigned CondReg = RI.getVCC();
+    const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+
+    MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+                                        E = MBB.rend();
+    bool ReadsCond = false;
+    unsigned Threshold = 5;
+    bool Changed = false;
+
+    for (++A; A != E; ++A) {
+      if (!--Threshold)
+        return false;
+      if (A->modifiesRegister(ExecReg, &RI))
+        return false;
+      if (A->modifiesRegister(CondReg, &RI)) {
+        if (!A->definesRegister(CondReg, &RI) || A->getOpcode() != And)
+          return false;
+        break;
+      }
+      ReadsCond |= A->readsRegister(CondReg, &RI);
+    }
+    if (A == E)
+      return false;
+
+    MachineOperand &Op1 = A->getOperand(1);
+    MachineOperand &Op2 = A->getOperand(2);
+    if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+      commuteInstruction(*A);
+      Changed = true;
+    }
+    if (Op1.getReg() != ExecReg)
+      return Changed;
+    if (Op2.isImm() && Op2.getImm() != -1)
+      return Changed;
+
+    Register SReg;
+    if (Op2.isReg()) {
+      SReg = Op2.getReg();
+      auto M = std::next(A);
+      bool ReadsSreg = false;
+      for (; M != E; ++M) {
+        if (M->definesRegister(SReg, &RI))
+          break;
+        if (M->modifiesRegister(SReg, &RI))
+          return Changed;
+        ReadsSreg |= M->readsRegister(SReg, &RI);
+      }
+      if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
+          M->getOperand(1).getImm() != -1)
+        return Changed;
+      // First if sreg is only used in and instruction fold the immediate
+      // into that and.
+      if (!ReadsSreg && Op2.isKill()) {
+        A->getOperand(2).ChangeToImmediate(-1);
+        M->eraseFromParent();
+      }
+    }
+
+    if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+        MI.killsRegister(CondReg, &RI))
+      A->eraseFromParent();
+
+    bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+    if (SReg == ExecReg) {
+      if (IsVCCZ) {
+        MI.eraseFromParent();
+        return true;
+      }
+      MI.setDesc(get(AMDGPU::S_BRANCH));
+    } else {
+      MI.setDesc(
+          get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
+    }
+
+    MI.RemoveOperand(
+        MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, &RI));
+    MI.addImplicitDefUseOperands(*MBB.getParent());
+
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
 // Copy the flags onto the implicit condition register operand.
 static void preserveCondRegFlags(MachineOperand &CondReg,
                                  const MachineOperand &OrigCond) {
Index: llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
+++ llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@@ -1,5 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass peephole-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass peephole-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s
 
 ---
 # GCN-LABEL: name: and_execz_mov_vccz
Index: llvm/test/CodeGen/AMDGPU/wave32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wave32.ll
+++ llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -836,8 +836,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_branch_true:
-; GFX1032: s_and_b32 vcc_lo, exec_lo, -1
-; GFX1064: s_and_b64 vcc, exec, -1
+; GFX1032: s_mov_b32 [[S_REG:s[0-9]+]], -1
+; GFX1032: s_and_b32 vcc_lo, exec_lo, [[S_REG]]
+; GFX1064: s_mov_b64 [[S_REG:s\[[0-9]+:[0-9]+\]]], -1
+; GFX1064: s_and_b64 vcc, exec, [[S_REG]]
 define amdgpu_kernel void @test_branch_true() #2 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph