Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -37,6 +37,7 @@ FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); // SI Passes +FunctionPass *createGCNDPPCombinePass(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); @@ -92,6 +93,9 @@ void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; +void initializeGCNDPPCombinePass(PassRegistry &); +extern char &GCNDPPCombineID; + void initializeR600ClauseMergePassPass(PassRegistry &); extern char &R600ClauseMergePassID; Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -11,6 +11,10 @@ include "llvm/Target/Target.td" include "AMDGPUFeatures.td" +class BoolToList { + list ret = !if(Value, [1], []); +} + //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -106,6 +106,11 @@ cl::desc("Enable SDWA peepholer"), cl::init(true)); +static cl::opt EnableDPPCombine( + "amdgpu-dpp-combine", + cl::desc("Enable DPP combiner"), + cl::init(false)); + // Enable address space based alias analysis static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), @@ -158,6 +163,7 @@ initializeR600VectorRegMergerPass(*PR); initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); + initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); 
initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -795,6 +801,8 @@ // // XXX - Can we get away without running DeadMachineInstructionElim again? addPass(&SIFoldOperandsID); + if (EnableDPPCombine) + addPass(&GCNDPPCombineID); addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); if (EnableSDWAPeephole) { Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5275,12 +5275,14 @@ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - // All DPP instructions with at least one source operand have a fake "old" - // source at the beginning that's tied to the dst operand. Handle it here. - if (Desc.getNumOperands() >= 2) - Inst.addOperand(Inst.getOperand(0)); - for (unsigned E = Operands.size(); I != E; ++I) { + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), + MCOI::TIED_TO); + if (TiedTo != -1) { + assert((unsigned)TiedTo < Inst.getNumOperands()); + // handle tied old or src2 for MAC instructions + Inst.addOperand(Inst.getOperand(TiedTo)); + } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -118,6 +118,7 @@ SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp + GCNDPPCombine.cpp ) add_subdirectory(AsmParser) Index: lib/Target/AMDGPU/GCNDPPCombine.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -0,0 +1,382 @@ +//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass combines DPP moves (V_MOV_B32_dpp) with the instructions that use their results +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Pass.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "gcn-dpp-combine" + +STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined."); + +namespace { + +class GCNDPPCombine : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + + using RegSubRegPair = TargetInstrInfo::RegSubRegPair; + + MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; + + RegSubRegPair foldOldOpnd(MachineInstr &OrigMI, + RegSubRegPair OldOpndVGPR, + MachineOperand &OldOpndValue) const; + + MachineInstr *createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + MachineOperand *OldOpnd, + bool BoundCtrlZero) const; + + MachineInstr *createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + bool BoundCtrlZero) const; + + bool combineDPPMov(MachineInstr &MI) const; + +public: + static char ID; + + GCNDPPCombine() : MachineFunctionPass(ID) { + initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN 
DPP Combine"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false) + +char GCNDPPCombine::ID = 0; + +char &llvm::GCNDPPCombineID = GCNDPPCombine::ID; + +FunctionPass *llvm::createGCNDPPCombinePass() { + return new GCNDPPCombine(); +} + +static int getDPPOp(unsigned Op) { + auto DPP32 = AMDGPU::getDPPOp32(Op); + if (DPP32 != -1) + return DPP32; + return AMDGPU::getDPPOp64(Op); +} + +// tracks the register operand definition and returns: +// 1. immediate operand used to initialize the register if found +// 2. nullptr if the register operand is undef +// 3. the operand itself otherwise +MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { + auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI); + if (!Def) + return nullptr; + + switch(Def->getOpcode()) { + default: break; + case AMDGPU::IMPLICIT_DEF: + return nullptr; + case AMDGPU::COPY: + case AMDGPU::V_MOV_B32_e32: { + auto &Op1 = Def->getOperand(1); + if (Op1.isImm()) + return &Op1; + break; + } + } + return &OldOpnd; +} + +MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + bool BoundCtrlZero) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); + + auto OrigOp = OrigMI.getOpcode(); + auto DPPOp = getDPPOp(OrigOp); + if (DPPOp == -1) { + LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); + return nullptr; + } + + auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, + OrigMI.getDebugLoc(), TII->get(DPPOp)); + bool Fail = false; + do { + auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); + assert(Dst); + DPPInst.add(*Dst); + int NumOperands = 1; + + const int OldIdx 
= AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); + if (OldIdx != -1) { + assert(OldIdx == NumOperands); + assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg); + ++NumOperands; + } + + if (auto *Mod0 = TII->getNamedOperand(OrigMI, + AMDGPU::OpName::src0_modifiers)) { + assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src0_modifiers)); + DPPInst.addImm(Mod0->getImm()); + ++NumOperands; + } + auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); + assert(Src0); + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { + LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src0); + ++NumOperands; + + if (auto *Mod1 = TII->getNamedOperand(OrigMI, + AMDGPU::OpName::src1_modifiers)) { + assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src1_modifiers)); + DPPInst.addImm(Mod1->getImm()); + ++NumOperands; + } + if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { + LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src1); + ++NumOperands; + } + + if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { + LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src2); + } + + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); + DPPInst.addImm(BoundCtrlZero ? 
1 : 0); + } while (false); + + if (Fail) { + DPPInst.getInstr()->eraseFromParent(); + return nullptr; + } + LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr()); + return DPPInst.getInstr(); +} + +GCNDPPCombine::RegSubRegPair +GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI, + RegSubRegPair OldOpndVGPR, + MachineOperand &OldOpndValue) const { + assert(OldOpndValue.isImm()); + switch (OrigMI.getOpcode()) { + default: break; + case AMDGPU::V_MAX_U32_e32: + if (OldOpndValue.getImm() == std::numeric_limits::max()) + return OldOpndVGPR; + break; + case AMDGPU::V_MAX_I32_e32: + if (OldOpndValue.getImm() == std::numeric_limits::max()) + return OldOpndVGPR; + break; + case AMDGPU::V_MIN_I32_e32: + if (OldOpndValue.getImm() == std::numeric_limits::min()) + return OldOpndVGPR; + break; + + case AMDGPU::V_MUL_I32_I24_e32: + case AMDGPU::V_MUL_U32_U24_e32: + if (OldOpndValue.getImm() == 1) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + assert(Src1 && Src1->isReg()); + return getRegSubRegPair(*Src1); + } + break; + } + return RegSubRegPair(); +} + +// Cases to combine: +// $bound_ctrl is DPP_BOUND_ZERO, $old is any +// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO + +// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF + +// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF + +MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + MachineOperand *OldOpndValue, + bool BoundCtrlZero) const { + assert(OldOpndVGPR.Reg); + if (!BoundCtrlZero && OldOpndValue) { + assert(OldOpndValue->isImm()); + OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue); + if (!OldOpndVGPR.Reg) { + LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n"); + return nullptr; + } + } + return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero); +} + +bool 
GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); + assert(BCZOpnd && BCZOpnd->isImm()); + bool BoundCtrlZero = 0 != BCZOpnd->getImm(); + + LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + + auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); + assert(OldOpnd && OldOpnd->isReg()); + auto OldOpndVGPR = getRegSubRegPair(*OldOpnd); + auto *OldOpndValue = getOldOpndValue(*OldOpnd); + assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); + if (OldOpndValue) { + if (BoundCtrlZero) { + OldOpndVGPR.Reg = 0; // should be undef, ignore old operand + OldOpndValue = nullptr; + } else { + if (!OldOpndValue->isImm()) { + LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n"); + return false; + } + if (OldOpndValue->getImm() == 0) { + OldOpndVGPR.Reg = 0; // should be undef + OldOpndValue = nullptr; + BoundCtrlZero = true; + } + } + } + + LLVM_DEBUG(dbgs() << " old="; + if (!OldOpndValue) dbgs() << "undef"; + else dbgs() << OldOpndValue->getImm(); + dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); + + std::vector OrigMIs, DPPMIs; + if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef + OldOpndVGPR = RegSubRegPair(MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); + auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); + DPPMIs.push_back(UndefInst.getInstr()); + } + + OrigMIs.push_back(&MovMI); + bool Rollback = true; + for (auto &Use : MRI->use_operands(TII->getNamedOperand( + MovMI, AMDGPU::OpName::vdst)->getReg())) { + Rollback = true; + + auto &OrigMI = *Use.getParent(); + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); + if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + 
Rollback = false; + } + } else if (OrigMI.isCommutable() && + &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *BB = OrigMI.getParent(); + auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); + BB->insert(OrigMI, NewMI); + if (TII->commuteInstruction(*NewMI)) { + LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); + if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else + LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n"); + NewMI->eraseFromParent(); + } else + LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); + if (Rollback) + break; + OrigMIs.push_back(&OrigMI); + } + + for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) + MI->eraseFromParent(); + + return !Rollback; +} + +bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget(); + if (!ST.hasDPP() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + + assert(MRI->isSSA() && "Must be run on SSA"); + + std::vector DPPMoves; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp) + DPPMoves.push_back(&MI); + } + } + + bool Changed = false; + for (auto *MI : DPPMoves) { + if (combineDPPMov(*MI)) { + Changed = true; + ++NumDPPMovsCombined; + } + } + return Changed; +} Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -908,9 +908,36 @@ /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. 
int pseudoToMCOpcode(int Opcode) const; - }; +/// \brief Returns true if a reg:subreg pair P is of the TRC register class +inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P, + const TargetRegisterClass &TRC, + MachineRegisterInfo &MRI) { + auto *RC = MRI.getRegClass(P.Reg); + if (!P.SubReg) + return RC == &TRC; + auto *TRI = MRI.getTargetRegisterInfo(); + return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg); +} + +/// \brief Create RegSubRegPair from a register MachineOperand +inline +TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) { + assert(O.isReg()); + return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg()); +} + +/// \brief Return the SubReg component from REG_SEQUENCE +TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, + unsigned SubReg); + +/// \brief Return the defining instruction for a given reg:subreg pair, +/// skipping copy-like instructions and subreg-manipulation pseudos. +/// Following another subreg of a reg:subreg isn't supported. +MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, + MachineRegisterInfo &MRI); + namespace AMDGPU { LLVM_READONLY @@ -923,6 +950,12 @@ int getSDWAOp(uint16_t Opcode); LLVM_READONLY + int getDPPOp32(uint16_t Opcode); + + LLVM_READONLY + int getDPPOp64(uint16_t Opcode); + + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); LLVM_READONLY Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5302,3 +5302,83 @@ return MCOp; } + +TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { + assert(RegOpnd.isReg()); + return RegOpnd.isUndef() ? 
TargetInstrInfo::RegSubRegPair() : + getRegSubRegPair(RegOpnd); +} + +TargetInstrInfo::RegSubRegPair +llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { + assert(MI.isRegSequence()); + for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) + if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { + auto &RegOp = MI.getOperand(1 + 2 * I); + return getRegOrUndef(RegOp); + } + return TargetInstrInfo::RegSubRegPair(); +} + +// Try to find the definition of reg:subreg in subreg-manipulation pseudos +// Following a subreg of reg:subreg isn't supported +static bool followSubRegDef(MachineInstr &MI, + TargetInstrInfo::RegSubRegPair &RSR) { + if (!RSR.SubReg) + return false; + switch (MI.getOpcode()) { + default: break; + case AMDGPU::REG_SEQUENCE: + RSR = getRegSequenceSubReg(MI, RSR.SubReg); + return true; + // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg + case AMDGPU::INSERT_SUBREG: + if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) + // inserted the subreg we're looking for + RSR = getRegOrUndef(MI.getOperand(2)); + else { // the subreg in the rest of the reg + auto R1 = getRegOrUndef(MI.getOperand(1)); + if (R1.SubReg) // subreg of subreg isn't supported + return false; + RSR.Reg = R1.Reg; + } + return true; + } + return false; +} + +MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, + MachineRegisterInfo &MRI) { + assert(MRI.isSSA()); + if (!TargetRegisterInfo::isVirtualRegister(P.Reg)) + return nullptr; + + auto RSR = P; + auto *DefInst = MRI.getVRegDef(RSR.Reg); + while (auto *MI = DefInst) { + DefInst = nullptr; + switch (MI->getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::V_MOV_B32_e32: { + auto &Op1 = MI->getOperand(1); + if (Op1.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg())) { + if (Op1.isUndef()) + return nullptr; + RSR = getRegSubRegPair(Op1); + DefInst = MRI.getVRegDef(RSR.Reg); + } + break; + } + default: + if (followSubRegDef(*MI, RSR)) { + if 
(!RSR.Reg) + return nullptr; + DefInst = MRI.getVRegDef(RSR.Reg); + } + } + if (!DefInst) + return MI; + } + return nullptr; +} Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1622,7 +1622,7 @@ 0, // 64-bit dst - No DPP or SDWA for 64-bit operands !if(!eq(Src0VT.Size, 64), 0, // 64-bit src0 - !if(!eq(Src0VT.Size, 64), + !if(!eq(Src1VT.Size, 64), 0, // 64-bit src2 1 ) @@ -1631,6 +1631,12 @@ ); } +class getHasDPP { + bit ret = !if(!eq(NumSrcArgs, 0), 0, + getHasExt.ret); +} + class BitOr { bit ret = !if(a, 1, !if(b, 1, 0)); } @@ -1710,7 +1716,7 @@ field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; - field bit HasExtDPP = HasExt; + field bit HasExtDPP = getHasDPP.ret; field bit HasExtSDWA = HasExt; field bit HasExtSDWA9 = HasExt; field int NeedPatGen = PatGenMode.NoPattern; @@ -1741,8 +1747,10 @@ getOpSelMod.ret, getOpSelMod.ret, getOpSelMod.ret>.ret; - field dag InsDPP = getInsDPP.ret; + field dag InsDPP = !if(HasExtDPP, + getInsDPP.ret, + (ins)); field dag InsSDWA = getInsSDWA.ret; @@ -1756,7 +1764,8 @@ HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret; - field string AsmDPP = getAsmDPP.ret; + field string AsmDPP = !if(HasExtDPP, + getAsmDPP.ret, ""); field string AsmSDWA = getAsmSDWA.ret; field string AsmSDWA9 = getAsmSDWA9.ret; } @@ -1931,6 +1940,23 @@ let ValueCols = [["Default"]]; } +// Maps ordinary instructions to their DPP counterparts +def getDPPOp32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["DPP"]]; +} +def getDPPOp64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["VOP3"]; + let ValueCols = [["DPP"]]; +} + + // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { 
let FilterClass = "Commutable_REV"; Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -84,6 +84,10 @@ let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_DPP_Pseudo pattern=[]> : + VOP_DPP_Pseudo { +} + class getVOP1Pat64 : LetDummies { list ret = !if(P.HasModifiers, @@ -103,6 +107,8 @@ def _e32 : VOP1_Pseudo ; def _e64 : VOP3_Pseudo .ret>; def _sdwa : VOP1_SDWA_Pseudo ; + foreach _ = BoolToList.ret in + def _dpp : VOP1_DPP_Pseudo ; } // Special profile for instructions which have clamp @@ -500,13 +506,8 @@ // VI //===----------------------------------------------------------------------===// -class VOP1_DPP op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : - VOP_DPP { - let Defs = ps.Defs; - let Uses = ps.Uses; - let SchedRW = ps.SchedRW; - let hasSideEffects = ps.hasSideEffects; - +class VOP1_DPPe op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPPe

{ bits<8> vdst; let Inst{8-0} = 0xfa; // dpp let Inst{16-9} = op; @@ -544,9 +545,10 @@ VOP_SDWA9_Real (NAME#"_sdwa")>, VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP1_DPP(NAME#"_e32")>; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.VI>, + VOP1_DPPe(NAME#"_dpp")>; } defm V_NOP : VOP1_Real_vi <0x0>; @@ -717,9 +719,11 @@ VOP_SDWA9_Real (NAME#"_sdwa")>, VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP1_DPP(NAME#"_e32")>; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe(NAME#"_dpp")>; + } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -105,6 +105,11 @@ let AsmMatchConverter = "cvtSdwaVOP2"; } +class VOP2_DPP_Pseudo pattern=[]> : + VOP_DPP_Pseudo { +} + + class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -155,7 +160,12 @@ bit GFX9Renamed = 0> : VOP2Inst_e32, VOP2Inst_e64, - VOP2Inst_sdwa; + VOP2Inst_sdwa { + let renamedInGFX9 = GFX9Renamed in { + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; + } +} multiclass VOP2bInst { let AsmMatchConverter = "cvtSdwaVOP2b"; } + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; } def _e64 : VOP3_Pseudo .ret>, @@ -194,6 +206,9 @@ def _sdwa : VOP2_SDWA_Pseudo { let AsmMatchConverter = "cvtSdwaVOP2b"; } + + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; } def _e64 : VOP3_Pseudo .ret>, @@ -233,9 +248,9 @@ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, 0, HasModifiers, HasOMod, Src0Mod, 
Src1Mod, Src2Mod>.ret; - let InsDPP = (ins DstRCDPP:$old, - Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); @@ -778,13 +793,8 @@ // VI //===----------------------------------------------------------------------===// -class VOP2_DPP op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> : - VOP_DPP { - let Defs = ps.Defs; - let Uses = ps.Uses; - let SchedRW = ps.SchedRW; - let hasSideEffects = ps.hasSideEffects; - +class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPPe

{ bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; //dpp @@ -865,8 +875,13 @@ VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - def _dpp : - VOP2_DPP(OpName#"_e32"), AsmName>; + foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real(OpName#"_dpp"), SIEncodingFamily.VI>, + VOP2_DPPe(OpName#"_dpp")> { + VOP2_DPP_Pseudo ps = !cast(OpName#"_dpp"); + let AsmString = AsmName # ps.AsmOperands; + } } } @@ -893,10 +908,14 @@ VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - def _dpp_gfx9 : - VOP2_DPP(OpName#"_e32"), AsmName> { - let DecoderNamespace = "SDWA9"; - } + foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real(OpName#"_dpp"), SIEncodingFamily.GFX9>, + VOP2_DPPe(OpName#"_dpp")> { + VOP2_DPP_Pseudo ps = !cast(OpName#"_dpp"); + let AsmString = AsmName # ps.AsmOperands; + let DecoderNamespace = "SDWA9"; + } } multiclass VOP2_Real_e32e64_gfx9 op> { @@ -914,19 +933,23 @@ VOP_SDWA9_Real (NAME#"_sdwa")>, VOP2_SDWA9Ae (NAME#"_sdwa").Pfl> { } - def _dpp_gfx9 : - VOP2_DPP(NAME#"_e32")> { - let DecoderNamespace = "SDWA9"; - } + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP2_DPPe(NAME#"_dpp")> { + let DecoderNamespace = "SDWA9"; + } } } // AssemblerPredicates = [isGFX9] multiclass VOP2_Real_e32e64_vi op> : Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP2_DPP(NAME#"_e32")>; + + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.VI>, + VOP2_DPPe(NAME#"_dpp")>; } defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; Index: lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPInstructions.td +++ 
lib/Target/AMDGPU/VOPInstructions.td @@ -505,9 +505,14 @@ let Inst{63-60} = row_mask; } -class VOP_DPP : - InstSI , - VOP_DPPe

{ +class VOP_DPP_Pseudo pattern=[]> : + InstSI , + VOP , + SIMCInstr , + MnemonicAlias { + + let isPseudo = 1; + let isCodeGenOnly = 1; let mayLoad = 0; let mayStore = 0; @@ -517,6 +522,11 @@ let VALU = 1; let DPP = 1; let Size = 8; + let Uses = [EXEC]; + let isConvergent = 1; + + string Mnemonic = OpName; + string AsmOperands = P.AsmDPP; let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); let SubtargetPredicate = HasDPP; @@ -526,6 +536,36 @@ let Constraints = !if(P.NumSrcArgs, "$old = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, "$old", ""); let DecoderNamespace = "DPP"; + + VOPProfile Pfl = P; +} + +class VOP_DPP_Real : + InstSI , + SIMCInstr { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let isConvergent = ps.isConvergent; + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; } class getNumNodeArgs { Index: test/CodeGen/AMDGPU/dpp_combine.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/dpp_combine.ll @@ -0,0 +1,186 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < %s | FileCheck %s + +; VOP2 with literal cannot be combined +; CHECK-LABEL: {{^}}dpp_combine_i32_literal: +; CHECK: v_mov_b32_dpp [[OLD:v[0-9]+]], {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x1 bound_ctrl:0 +; CHECK: v_add_u32_e32 {{v[0-9]+}}, vcc, 42, [[OLD]] +define amdgpu_kernel void 
@dpp_combine_i32_literal(i32 addrspace(1)* %out, i32 %in) { + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 2, i32 1, i1 1) #0 + %res = add nsw i32 %dpp, 42 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_bz: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_i32_bz(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + %res = add nsw i32 %dpp, %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_undef: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_undef(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %res = add nsw i32 %dpp, %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_0: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_i32_boff_0(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %res = add nsw i32 %dpp, %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_max: +; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], -2 +; CHECK: v_max_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_max(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() 
+ %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 2147483647, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %cmp = icmp sge i32 %dpp, %x + %res = select i1 %cmp, i32 %dpp, i32 %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_min: +; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], 1 +; CHECK: v_min_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_min(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %cmp = icmp sle i32 %dpp, %x + %res = select i1 %cmp, i32 %dpp, i32 %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_mul: +; CHECK: v_mul_i32_i24_dpp v0, v3, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_mul(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 1, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + + %dpp.shl = shl i32 %dpp, 8 + %dpp.24 = ashr i32 %dpp.shl, 8 + %x.shl = shl i32 %x, 8 + %x.24 = ashr i32 %x.shl, 8 + %res = mul i32 %dpp.24, %x.24 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_commute: +; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_i32_commute(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 2, i32 1, i32 1, i1 1) #0 + %res = sub nsw i32 %x, %dpp + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_f32: +; CHECK: v_add_f32_dpp {{v[0-9]+}}, {{v[0-9]+}}, v0 quad_perm:[3,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 
+define amdgpu_kernel void @dpp_combine_f32(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 3, i32 1, i32 1, i1 1) #0 + %dpp.f32 = bitcast i32 %dpp to float + %x.f32 = bitcast i32 %x to float + %res.f32 = fadd float %x.f32, %dpp.f32 + %res = bitcast float %res.f32 to i32 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_test_f32_mods: +; CHECK: v_mul_f32_dpp {{v[0-9]+}}, |{{v[0-9]+}}|, -v0 quad_perm:[0,1,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_test_f32_mods(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 4, i32 1, i32 1, i1 1) #0 + + %x.f32 = bitcast i32 %x to float + %x.f32.neg = fsub float -0.000000e+00, %x.f32 + + %dpp.f32 = bitcast i32 %dpp to float + %dpp.f32.cmp = fcmp fast olt float %dpp.f32, 0.000000e+00 + %dpp.f32.sign = select i1 %dpp.f32.cmp, float -1.000000e+00, float 1.000000e+00 + %dpp.f32.abs = fmul fast float %dpp.f32, %dpp.f32.sign + + %res.f32 = fmul float %x.f32.neg, %dpp.f32.abs + %res = bitcast float %res.f32 to i32 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_mac: +; CHECK: v_mac_f32_dpp v0, {{v[0-9]+}}, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_mac(float addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + %dpp.f32 = bitcast i32 %dpp to float + %x.f32 = bitcast i32 %x to float + %y.f32 = bitcast i32 %y to float + + %mult = fmul float %dpp.f32, %y.f32 + %res = fadd float %mult, %x.f32 + store float %res, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: 
{{^}}dpp_combine_sequence: +define amdgpu_kernel void @dpp_combine_sequence(i32 addrspace(1)* %out, i32 %in, i1 %cmp) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + br i1 %cmp, label %bb1, label %bb2 +bb1: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 + %resadd = add nsw i32 %dpp, %x + br label %bb3 +bb2: +; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 + %ressub = sub nsw i32 %x, %dpp + br label %bb3 +bb3: + %res = phi i32 [%resadd, %bb1], [%ressub, %bb2] + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_sequence_negative: +; CHECK: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_sequence_negative(i32 addrspace(1)* %out, i32 %in, i1 %cmp) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + br i1 %cmp, label %bb1, label %bb2 +bb1: + %resadd = add nsw i32 %dpp, %x + br label %bb3 +bb2: + %ressub = sub nsw i32 2, %dpp ; break seq + br label %bb3 +bb3: + %res = phi i32 [%resadd, %bb1], [%ressub, %bb2] + store i32 %res, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 + +attributes #0 = { nounwind readnone convergent } + Index: test/CodeGen/AMDGPU/dpp_combine_subregs.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/dpp_combine_subregs.mir @@ -0,0 +1,94 @@ +# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - %s | FileCheck %s + +# test if $old definition is correctly tracked through subreg 
manipulation pseudos + +--- +# CHECK-LABEL: name: mul_old_subreg +# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec + +name: mul_old_subreg +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vreg_64 } + - { id: 5, class: vreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec + %4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4 + %6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec + %7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec +... + +# CHECK-LABEL: name: add_old_subreg +# CHECK: [[OLD:\%[0-9]+]]:vgpr_32 = IMPLICIT_DEF +# CHECK: %5:vgpr_32 = V_ADD_U32_dpp [[OLD]], %1, %0.sub1, 1, 1, 1, 1, implicit $exec + +name: add_old_subreg +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted + %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec +... 
+ +# CHECK-LABEL: name: add_old_subreg_undef +# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec + +name: add_old_subreg_undef +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef + %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec +... + Index: test/MC/AMDGPU/vop_dpp.s =================================================================== --- test/MC/AMDGPU/vop_dpp.s +++ test/MC/AMDGPU/vop_dpp.s @@ -116,7 +116,6 @@ //===----------------------------------------------------------------------===// // NOSICI: error: -// VI9: v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x00,0x00,0x7e,0x00,0x01,0x09,0xa1] v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 // NOSICI: error: